Project_Carmignac/.ipynb_checkpoints/detection_rupture-checkpoint.py

87 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
def detect_ruptures(df, epsilon=0.05):
    """Detect AUM "ruptures" between registrar accounts.

    A rupture is reported when, on the same centralisation date and for the
    same agreement / company / ultimate parent / region / country, one
    registrar account drops to an AUM of exactly 0 while another account
    rises from an AUM of exactly 0, and the two movements have nearly the
    same magnitude.

    Args:
        df: Input DataFrame. It must contain the columns 'Agreement - Code',
            'Company - Id', 'Company - Ultimate Parent Id',
            'Registrar Account - Region', 'RegistrarAccount - Country',
            'Registrar Account - ID', 'Centralisation Date' and
            'Value - AUM €'. The frame is not modified.
        epsilon: Maximum accepted symmetric relative difference between the
            outflow and the inflow, computed as
            ``|out + in| / ((|out| + |in|) / 2)``.

    Returns:
        A DataFrame with the columns 'date', 'old_account', 'new_account'
        and 'value' (the AUM gained by the new account), one row per
        detected rupture.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    date_col = 'Centralisation Date'
    aum_col = 'Value - AUM €'
    account_col = 'Registrar Account - ID'
    # Attributes that the old and the new account of a rupture must share.
    owner_cols = [
        'Agreement - Code',
        'Company - Id',
        'Company - Ultimate Parent Id',
        'Registrar Account - Region',
        'RegistrarAccount - Country',
    ]
    # Full key identifying one account.
    key_cols = owner_cols + [account_col]
    agg_cols = key_cols + [date_col]
    # Key used to pair a vanishing account with an appearing one (no ID).
    merge_cols = [date_col] + owner_cols

    # Work on a copy so the caller's frame is left untouched.
    df_temp = df.copy()
    df_temp[date_col] = pd.to_datetime(df_temp[date_col])
    # Rows without a date cannot be placed on the timeline; they never
    # contributed to the result, so discard them explicitly.
    df_temp = df_temp.dropna(subset=[date_col])

    # Every (account, date) combination, so that a missing observation can be
    # treated as an AUM of 0.
    accounts = df_temp[key_cols].drop_duplicates()
    dates = df_temp[[date_col]].drop_duplicates()
    full_index = accounts.merge(dates, how='cross')

    # Total AUM per account and date. dropna=False keeps accounts that have a
    # missing value in one of the key columns (e.g. no ultimate parent);
    # with the default they would silently be dropped and read as 0 forever.
    df_agg = (
        df_temp.groupby(agg_cols, dropna=False)[aum_col]
        .sum()
        .reset_index()
    )

    df_full = full_index.merge(df_agg, on=agg_cols, how='left')
    df_full[aum_col] = df_full[aum_col].fillna(0)
    # Chronological order inside each account is required by diff/shift.
    df_full = df_full.sort_values(agg_cols)

    # Variation and previous value per account; the first date of each
    # account has no predecessor and is treated as "no change from 0".
    per_account = df_full.groupby(key_cols, dropna=False)[aum_col]
    aum_diff = per_account.diff().fillna(0)
    prev_value = per_account.shift(1).fillna(0)
    df_full['AUM_diff'] = aum_diff
    df_full['prev_value'] = prev_value

    # Accounts that lose everything on a date.
    df_zero = df_full[(df_full['AUM_diff'] < 0) & (df_full[aum_col] == 0)]
    # Accounts that start from 0 on a date.
    df_from_zero = df_full[
        (df_full['AUM_diff'] > 0) & (df_full['prev_value'] == 0)
    ]

    # Candidate ruptures: same date and owner attributes, different account.
    ruptures = df_zero.merge(
        df_from_zero, on=merge_cols, suffixes=('_old', '_new')
    )

    # Keep only the pairs whose outflow and inflow almost cancel out. The
    # denominator is never 0 because the outflow is strictly negative and
    # the inflow strictly positive.
    outflow = ruptures['AUM_diff_old']
    inflow = ruptures['AUM_diff_new']
    diff_rel = (outflow + inflow).abs() / ((outflow.abs() + inflow.abs()) / 2)
    ruptures = ruptures[diff_rel <= epsilon]

    return ruptures[
        [date_col, account_col + '_old', account_col + '_new', 'AUM_diff_new']
    ].rename(
        columns={
            date_col: 'date',
            account_col + '_old': 'old_account',
            account_col + '_new': 'new_account',
            'AUM_diff_new': 'value',
        }
    )