423 lines
19 KiB
Python
423 lines
19 KiB
Python
|
|
"""
|
|||
|
|
predictive_model.py
|
|||
|
|
────────────────────
|
|||
|
|
Modélisation prédictive des flux nets avec walk-forward validation.
|
|||
|
|
|
|||
|
|
Ce module est intentionnellement séparé du feature engineering :
|
|||
|
|
il prend en entrée le dataset produit par FeatureBuilder et se concentre
|
|||
|
|
sur l'entraînement, la validation et l'interprétation des modèles.
|
|||
|
|
|
|||
|
|
Modèles implémentés :
|
|||
|
|
- Baseline : prédiction zéro (benchmark naïf)
|
|||
|
|
- Ridge : régression linéaire régularisée (interprétable)
|
|||
|
|
- RandomForest : non-linéaire, robuste aux outliers
|
|||
|
|
- GradientBoosting : état de l'art sur données tabulaires
|
|||
|
|
|
|||
|
|
Validation : walk-forward expanding window (pas de data leakage).
|
|||
|
|
|
|||
|
|
Usage :
|
|||
|
|
from feature_engineering import FeatureBuilder
|
|||
|
|
from predictive_model import WalkForwardModel
|
|||
|
|
|
|||
|
|
feature_cols = FeatureBuilder.get_feature_columns(dataset)
|
|||
|
|
model = WalkForwardModel()
|
|||
|
|
results = model.fit_evaluate(dataset, feature_cols)
|
|||
|
|
model.plot_results(output_path="results.png")
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
import matplotlib.pyplot as plt
|
|||
|
|
import matplotlib.gridspec as gridspec
|
|||
|
|
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
|||
|
|
from sklearn.linear_model import Ridge
|
|||
|
|
from sklearn.preprocessing import StandardScaler
|
|||
|
|
from sklearn.metrics import mean_absolute_error, r2_score
|
|||
|
|
from sklearn.inspection import permutation_importance
|
|||
|
|
import warnings
|
|||
|
|
# Install a process-wide "ignore" filter: no Python warning is displayed
# once this module has been imported.
warnings.filterwarnings("ignore")


# ── Constants ─────────────────────────────────────────────────────────────────

# Colour cycle shared by the charts of this module.
COLORS = ["#1f4e79", "#2e75b6", "#70ad47", "#ed7d31", "#a50026"]

# Registry of candidate models, keyed by display name. Each entry holds:
#   cls    – estimator class to instantiate
#   kwargs – keyword arguments passed to the constructor
#   scale  – whether the features are standardised before fitting
MODEL_CONFIGS = {
    "Ridge": dict(
        cls=Ridge,
        kwargs=dict(alpha=1.0),
        scale=True,  # requires standardisation
    ),
    "Random Forest": dict(
        cls=RandomForestRegressor,
        kwargs=dict(
            n_estimators=200,
            max_depth=6,
            min_samples_leaf=3,
            random_state=42,
            n_jobs=-1,
        ),
        scale=False,
    ),
    "Gradient Boosting": dict(
        cls=GradientBoostingRegressor,
        kwargs=dict(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            subsample=0.8,
            random_state=42,
        ),
        scale=False,
    ),
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Classe principale ─────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
class WalkForwardModel:
    """
    Train and evaluate several models with walk-forward validation.

    Every fold trains on all snapshot dates that precede the test date
    (expanding window) and is scored on that single test date.

    Parameters
    ----------
    date_col : name of the date column in the dataset (monthly snapshots)
    target_col : name of the target column
    min_train_frac : minimum fraction of the distinct dates used for
        training (default 0.4); at least 3 dates are always required

    Attributes (populated by ``fit_evaluate``)
    ------------------------------------------
    results_df_ : one row per (model, test date) with MAE / R² / sample counts
    importances_ : ``{"gini": Series, "permutation": Series}`` computed from
        the final Random Forest
    final_models_ : ``{model name: (fitted model, scaler or None, columns)}``
    """

    # Label of the naive "always predict zero" benchmark in results_df_.
    _BASELINE_NAME = "Baseline (zéro)"

    # Column layout of results_df_; also used when no fold could be scored so
    # that the empty frame still exposes the expected columns.
    _RESULT_COLUMNS = ["test_date", "model", "mae", "r2", "n_test", "n_train"]

    def __init__(self,
                 date_col: str = "Centralisation Date",
                 target_col: str = "flux_net_proxy",
                 min_train_frac: float = 0.4):
        self.date_col = date_col
        self.target_col = target_col
        self.min_train_frac = min_train_frac

        # Post-fit storage
        self.results_df_ = pd.DataFrame()
        self.importances_ = {}
        self.final_models_ = {}

    # ── Walk-forward ──────────────────────────────────────────────────────────

    def fit_evaluate(self,
                     dataset: pd.DataFrame,
                     feature_cols: list[str],
                     verbose: bool = True) -> pd.DataFrame:
        """
        Run the expanding-window walk-forward over every configured model.

        Parameters
        ----------
        dataset : frame holding the date column, the target column and the
            feature columns
        feature_cols : names of the feature columns (names absent from the
            frame are ignored)
        verbose : print progress information and the summary table

        Returns
        -------
        DataFrame with the metrics per (model, test date). It is empty when
        there are too few distinct dates or when no fold could be scored.

        Raises
        ------
        ValueError : the target column is missing from ``dataset`` although
            there are enough dates to run the validation.
        """
        # Start from a clean state so a re-fit never exposes stale results.
        self.results_df_ = pd.DataFrame()
        self.importances_ = {}
        self.final_models_ = {}

        dataset = dataset.copy()
        dataset[self.date_col] = pd.to_datetime(dataset[self.date_col])

        # Rows without a date cannot be placed on the timeline: NaT is left
        # out of the fold boundaries.
        dates_sorted = sorted(dataset[self.date_col].dropna().unique())
        n_dates = len(dates_sorted)
        min_train = max(3, int(n_dates * self.min_train_frac))

        if verbose:
            print(f"Walk-forward : {n_dates} dates | min train = {min_train}")

        if n_dates <= min_train:
            print("⚠ Pas assez de dates pour le walk-forward. "
                  "Augmenter la taille du dataset.")
            return self.results_df_

        if self.target_col not in dataset.columns:
            raise ValueError(
                f"Colonne cible '{self.target_col}' absente du dataset."
            )

        records = []
        for test_idx in range(min_train, n_dates):
            test_date = dates_sorted[test_idx]
            in_train = dataset[self.date_col].isin(dates_sorted[:test_idx])
            train = dataset[in_train]
            test = dataset[dataset[self.date_col] == test_date]

            X_train, y_train = self._drop_missing_target(
                *self._prepare(train, feature_cols, fit=True)
            )
            X_test, y_test = self._drop_missing_target(
                *self._prepare(test, feature_cols, fit=False)
            )

            # Nothing to train on / score on, or a constant target on the
            # test date (R² undefined): skip the fold.
            if len(X_train) == 0 or len(X_test) == 0 or y_test.std() == 0:
                continue

            # Baseline: always predict zero
            zeros = np.zeros(len(y_test))
            records.append({
                "test_date": test_date,
                "model": self._BASELINE_NAME,
                "mae": mean_absolute_error(y_test, zeros),
                "r2": r2_score(y_test, zeros),
                "n_test": len(y_test),
                "n_train": len(X_train),
            })

            for model_name, cfg in MODEL_CONFIGS.items():
                model = cfg["cls"](**cfg["kwargs"])
                scaler = StandardScaler() if cfg["scale"] else None

                # The scaler is fitted on the training window only.
                if scaler is not None:
                    X_tr = scaler.fit_transform(X_train)
                    X_te = scaler.transform(X_test)
                else:
                    X_tr, X_te = X_train, X_test

                model.fit(X_tr, y_train)
                preds = model.predict(X_te)

                records.append({
                    "test_date": test_date,
                    "model": model_name,
                    "mae": mean_absolute_error(y_test, preds),
                    "r2": r2_score(y_test, preds),
                    "n_test": len(y_test),
                    "n_train": len(X_train),
                })

        self.results_df_ = pd.DataFrame(records, columns=self._RESULT_COLUMNS)

        # The summary groups by model; skip it when every fold was skipped.
        if verbose and not self.results_df_.empty:
            self._print_results_summary()

        # Train the final models on all the data
        self._fit_final_models(dataset, feature_cols)

        return self.results_df_

    # ── Final models (for feature importance) ─────────────────────────────────

    def _fit_final_models(self, dataset: pd.DataFrame, feature_cols: list[str]):
        """Fit every model on the whole dataset (used for interpretation).

        Fills ``final_models_`` and, when a Random Forest is configured,
        ``importances_``. Both importances are computed on the same rows the
        model was fitted on (in-sample).
        """
        X, y = self._drop_missing_target(
            *self._prepare(dataset, feature_cols, fit=True)
        )
        if len(X) == 0:
            return

        # _prepare keeps only the columns present in the frame; use the same
        # list so the importance index always matches the width of X.
        used_cols = [c for c in feature_cols if c in dataset.columns]

        for model_name, cfg in MODEL_CONFIGS.items():
            model = cfg["cls"](**cfg["kwargs"])
            scaler = StandardScaler() if cfg["scale"] else None
            X_fit = scaler.fit_transform(X) if scaler is not None else X
            model.fit(X_fit, y)
            self.final_models_[model_name] = (model, scaler, used_cols)

        # Feature importance: Random Forest (impurity-based) + permutation
        rf_model, _, _ = self.final_models_.get("Random Forest", (None, None, None))
        if rf_model is not None:
            self.importances_["gini"] = pd.Series(
                rf_model.feature_importances_, index=used_cols
            ).sort_values(ascending=False)

            # No explicit scoring: the estimator's default scorer is used.
            perm = permutation_importance(
                rf_model, X, y, n_repeats=10, random_state=42, n_jobs=-1
            )
            self.importances_["permutation"] = pd.Series(
                perm.importances_mean, index=used_cols
            ).sort_values(ascending=False)

    # ── Prediction ────────────────────────────────────────────────────────────

    def predict(self, new_data: pd.DataFrame,
                model_name: str = "Random Forest") -> np.ndarray:
        """Predict net flows for new data with one of the final models.

        Raises
        ------
        ValueError : ``model_name`` has not been fitted, or ``new_data`` lacks
            a feature column the model was trained on.
        """
        if model_name not in self.final_models_:
            raise ValueError(f"Modèle '{model_name}' non disponible. "
                             f"Disponibles : {list(self.final_models_.keys())}")

        model, scaler, feature_cols = self.final_models_[model_name]

        # Fail early with an explicit message instead of handing the model a
        # matrix that has fewer columns than it was trained on.
        missing = [c for c in feature_cols if c not in new_data.columns]
        if missing:
            raise ValueError(f"Colonnes manquantes dans new_data : {missing}")

        X, _ = self._prepare(new_data, feature_cols, fit=False)
        X_pred = scaler.transform(X) if scaler is not None else X
        return model.predict(X_pred)

    # ── Visualisation ─────────────────────────────────────────────────────────

    def plot_results(self, output_path: str = "model_results.png"):
        """Render the walk-forward result charts and save them to ``output_path``.

        When no result is available, a schematic of the validation scheme is
        saved instead.
        """
        if self.results_df_.empty:
            print("⚠ Aucun résultat à visualiser (walk-forward non exécuté).")
            self._plot_schema(output_path)
            return

        results = self.results_df_
        model_names = results["model"].unique().tolist()
        # One colour per model, shared by every panel (alphabetical order,
        # i.e. the order in which groupby yields the models).
        palette = {
            name: COLORS[i % len(COLORS)]
            for i, name in enumerate(sorted(model_names))
        }

        fig = plt.figure(figsize=(16, 14))
        fig.patch.set_facecolor("white")
        gs = gridspec.GridSpec(3, 2, figure=fig, hspace=0.45, wspace=0.35)

        # ── [A] MAE per model and date ────────────────────────────────────────
        ax1 = fig.add_subplot(gs[0, :])
        for model_name, grp in results.groupby("model"):
            ax1.plot(grp["test_date"], grp["mae"],
                     label=model_name, linewidth=1.8,
                     color=palette[model_name],
                     linestyle="--" if "Baseline" in model_name else "-")
        ax1.set_title("Walk-Forward Validation — MAE par modèle", fontsize=13, fontweight="bold")
        ax1.set_ylabel("MAE (€)")
        ax1.legend(fontsize=9)
        ax1.tick_params(axis="x", rotation=20)

        # ── [B] R² per model (baseline left out, values clipped to [-1, 1]) ───
        ax2 = fig.add_subplot(gs[1, 0])
        for model_name, grp in results.groupby("model"):
            if "Baseline" in model_name:
                continue
            ax2.plot(grp["test_date"], grp["r2"].clip(-1, 1),
                     label=model_name, linewidth=1.5,
                     color=palette[model_name])
        ax2.axhline(0, color="black", linestyle="--", linewidth=1, alpha=0.5)
        ax2.set_title("R² par modèle", fontsize=12, fontweight="bold")
        ax2.set_ylabel("R²")
        ax2.legend(fontsize=9)
        ax2.tick_params(axis="x", rotation=20)

        # ── [C] Aggregated MAE (box plots) ────────────────────────────────────
        ax3 = fig.add_subplot(gs[1, 1])
        mae_by_model = [
            results.loc[results["model"] == name, "mae"].dropna().values
            for name in model_names
        ]
        bp = ax3.boxplot(mae_by_model, patch_artist=True,
                         medianprops=dict(color="black", linewidth=2))
        # Tick labels are set explicitly: the ``labels`` keyword of boxplot is
        # deprecated in recent matplotlib releases. Boxes sit at 1..N.
        ax3.set_xticks(list(range(1, len(model_names) + 1)))
        ax3.set_xticklabels(model_names)
        for patch, name in zip(bp["boxes"], model_names):
            patch.set_facecolor(palette[name])
            patch.set_alpha(0.7)
        ax3.set_title("Distribution de MAE (tous folds)", fontsize=12, fontweight="bold")
        ax3.set_ylabel("MAE (€)")
        ax3.tick_params(axis="x", rotation=20)

        # ── [D] Feature importance (impurity-based) ───────────────────────────
        ax4 = fig.add_subplot(gs[2, 0])
        if "gini" in self.importances_:
            imp = self.importances_["gini"].head(15)
            colors_imp = self._feature_colors(imp.index)
            # Reversed so the most important feature is drawn at the top.
            ax4.barh(imp.index[::-1], imp.values[::-1], color=colors_imp[::-1])
            ax4.set_title("Importance (Gini) — Top 15 features", fontsize=12, fontweight="bold")
            ax4.set_xlabel("Importance relative")
            from matplotlib.patches import Patch
            legend_elements = [
                Patch(color="#70ad47", label="Perf relative (peers)"),
                Patch(color="#ed7d31", label="Perf absolue"),
                Patch(color="#1f4e79", label="Comportement AUM"),
            ]
            ax4.legend(handles=legend_elements, fontsize=8, loc="lower right")
        else:
            ax4.axis("off")
            ax4.text(0.5, 0.5, "Importance des variables\nnon disponible",
                     ha="center", va="center", fontsize=12)

        # ── [E] Permutation importance ────────────────────────────────────────
        ax5 = fig.add_subplot(gs[2, 1])
        if "permutation" in self.importances_:
            pimp = self.importances_["permutation"].head(15)
            pimp = pimp[pimp > 0]  # keep only the features that help
            colors_pimp = self._feature_colors(pimp.index)
            ax5.barh(pimp.index[::-1], pimp.values[::-1], color=colors_pimp[::-1])
            ax5.set_title("Permutation Importance — Top 15", fontsize=12, fontweight="bold")
            # The importances are computed with the regressor's default
            # score (R²), so the axis shows a drop in R², not a MAE change.
            ax5.set_xlabel("Δ R² moyen (permutation)")
        else:
            ax5.axis("off")

        fig.suptitle("Carmignac × ENSAE — Résultats du modèle prédictif",
                     fontsize=14, fontweight="bold", y=1.01)

        fig.savefig(output_path, dpi=150, bbox_inches="tight", facecolor="white")
        plt.close(fig)
        print(f"✅ Graphiques sauvegardés : {output_path}")

    def _plot_schema(self, output_path: str):
        """Save a schematic of the pipeline when there is not enough data."""
        fig, ax = plt.subplots(figsize=(12, 6))
        fig.patch.set_facecolor("white")
        ax.axis("off")
        ax.set_xlim(0, 10)
        ax.set_ylim(0, 5)

        schema = (
            "WALK-FORWARD VALIDATION — SCHÉMA\n\n"
            " t₁ t₂ t₃ t₄ t₅ t₆ t₇ ...\n"
            " ─────────────────────────────────\n"
            " TRAIN ████████ │TEST│\n"
            " TRAIN ███████████│TEST│\n"
            " TRAIN ██████████████│TEST│\n"
            " ...\n\n"
            "Principe :\n"
            " → Expanding window : la fenêtre de train s'agrandit à chaque fold\n"
            " → Test = 1 date future (mois suivant)\n"
            " → Aucune information future dans le train → pas de data leakage\n\n"
            "Métriques calculées à chaque fold :\n"
            " → MAE (Mean Absolute Error) en € de flux\n"
            " → R² (coefficient de détermination)\n\n"
            "Relancer avec les données complètes pour obtenir les résultats réels."
        )
        ax.text(0.5, 4.8, schema, va="top", fontsize=11, fontfamily="monospace",
                bbox=dict(boxstyle="round", facecolor="#eaf2fb", alpha=0.8))
        ax.set_title("Modèle prédictif — En attente de données complètes",
                     fontsize=13, fontweight="bold")

        fig.savefig(output_path, dpi=150, bbox_inches="tight", facecolor="white")
        plt.close(fig)
        print(f"✅ Schéma sauvegardé : {output_path}")

    # ── Internal helpers ──────────────────────────────────────────────────────

    def _prepare(self, df: pd.DataFrame,
                 feature_cols: list[str],
                 fit: bool) -> tuple[np.ndarray, np.ndarray]:
        """Extract X and y from the frame.

        Feature columns absent from ``df`` are ignored and missing feature
        values are replaced by 0. ``y`` is an empty array when the target
        column is absent. ``fit`` is accepted for interface compatibility and
        is not used.
        """
        available = [c for c in feature_cols if c in df.columns]
        X = df[available].fillna(0).values
        if self.target_col in df.columns:
            y = df[self.target_col].values
        else:
            y = np.array([])
        return X, y

    @staticmethod
    def _drop_missing_target(X: np.ndarray,
                             y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """Remove the rows whose target is missing (NaN / None / NaT)."""
        keep = ~pd.isna(y)
        if keep.all():
            return X, y
        return X[keep], y[keep]

    @staticmethod
    def _feature_colors(names) -> list[str]:
        """Bar colour per feature name, chosen from its ``rel_`` / ``perf_`` prefix."""
        return [
            "#70ad47" if name.startswith("rel_") else
            "#ed7d31" if name.startswith("perf_") else
            "#1f4e79"
            for name in names
        ]

    def _print_results_summary(self):
        """Print per-model aggregates of the fold metrics, best median MAE first."""
        print("\n── Résultats walk-forward (médiane sur tous les folds) ──")
        summary = (
            self.results_df_
            .groupby("model")
            .agg(
                MAE_median=("mae", "median"),
                MAE_mean=("mae", "mean"),
                R2_median=("r2", "median"),
                n_folds=("mae", "count"),
            )
            .round(4)
            .sort_values("MAE_median")
        )
        print(summary.to_string())
        print("─────────────────────────────────────────────────────────")

    # ── Public API ────────────────────────────────────────────────────────────

    def get_best_model(self) -> str:
        """Return the name of the model with the lowest median MAE.

        The baseline is never returned. Falls back to ``"Random Forest"``
        when no non-baseline result is available.
        """
        if self.results_df_.empty:
            return "Random Forest"
        scored = self.results_df_[self.results_df_["model"] != self._BASELINE_NAME]
        medians = scored.groupby("model")["mae"].median().dropna()
        if medians.empty:
            # Only baseline rows (or only NaN scores): nothing to rank.
            return "Random Forest"
        return medians.idxmin()

    def get_top_features(self, n: int = 10,
                         method: str = "permutation") -> list[str]:
        """Return the ``n`` most important features.

        When ``method`` is unavailable, the first stored importance is used
        instead; an empty list is returned when none has been computed.
        """
        if method not in self.importances_:
            if not self.importances_:
                return []
            method = next(iter(self.importances_))
        return self.importances_[method].head(n).index.tolist()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Usage autonome ────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    import sys

    from feature_engineering import FeatureBuilder

    # CSV path: first command-line argument when given, default file otherwise.
    if len(sys.argv) > 1:
        dataset_path = sys.argv[1]
    else:
        dataset_path = "dataset_features.csv"

    dataset = pd.read_csv(dataset_path)
    feature_cols = FeatureBuilder.get_feature_columns(dataset)
    print(f"Dataset : {dataset.shape} | {len(feature_cols)} features")

    # Walk-forward evaluation, then the charts (saved next to the script).
    model = WalkForwardModel()
    results = model.fit_evaluate(dataset, feature_cols)
    model.plot_results("model_results.png")

    # The headline figures only make sense when at least one fold was scored.
    if not results.empty:
        print(f"\nMeilleur modèle : {model.get_best_model()}")
        print(f"Top features : {model.get_top_features(5)}")
|