Stats descriptives
This commit is contained in:
parent
7b47c1d61e
commit
45d5eb7df8
33
.ipynb_checkpoints/Stats_descriptives-checkpoint.ipynb
Normal file
33
.ipynb_checkpoints/Stats_descriptives-checkpoint.ipynb
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5e49fa70-05e3-4d9a-82db-e36ab3c993c7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
86
.ipynb_checkpoints/detection_rupture-checkpoint.py
Normal file
86
.ipynb_checkpoints/detection_rupture-checkpoint.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import pandas as pd
|
||||
|
||||
def detect_ruptures(df, epsilon=0.05):
    """Detect "ruptures": AUM that leaves one account and reappears on another.

    The function builds a complete account x date grid (missing observations
    count as 0 AUM), then pairs, on the same date, an account whose AUM drops
    to 0 with an account whose AUM rises from 0, provided both share the same
    identifying columns other than 'Registrar Account - ID'. A pair is kept
    when the outflow and the inflow differ by at most ``epsilon`` relative to
    their mean magnitude.

    Accounts with a missing value in an identifying column are kept (groupby
    would drop them by default) and missing values match each other when
    pairing. Rows without a date are ignored. Nothing can be detected on the
    earliest date because there is no previous value to compare with.

    Args:
        df: DataFrame holding the identifying columns listed below,
            'Centralisation Date' and 'Value - AUM €'. It is not modified.
        epsilon: Maximum relative difference between outflow and inflow.

    Returns:
        DataFrame with columns 'date', 'old_account', 'new_account' and
        'value' (the AUM increase observed on the new account).

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    value_col = 'Value - AUM €'
    date_col = 'Centralisation Date'
    id_col = 'Registrar Account - ID'

    # Columns both sides of a rupture must share (everything but the ID).
    profile_cols = [
        'Agreement - Code',
        'Company - Id',
        'Company - Ultimate Parent Id',
        'Registrar Account - Region',
        'RegistrarAccount - Country',
    ]
    key_cols = profile_cols + [id_col]
    agg_cols = key_cols + [date_col]

    missing = [col for col in agg_cols + [value_col] if col not in df.columns]
    if missing:
        raise KeyError(f"detect_ruptures: missing required column(s): {missing}")

    # Copy only what is needed so the caller's frame is never touched.
    df_temp = df[agg_cols + [value_col]].copy()
    df_temp[date_col] = pd.to_datetime(df_temp[date_col])
    # Undated rows cannot be placed on the timeline; drop them so that NaT
    # never becomes a (spurious) period of the grid.
    df_temp = df_temp.dropna(subset=[date_col])

    # Distinct observation dates, in chronological order.
    full_dates = (
        pd.Series(df_temp[date_col].unique())
        .sort_values()
        .reset_index(drop=True)
    )

    # Every account crossed with every date.
    accounts = df_temp[key_cols].drop_duplicates()
    full_index = accounts.merge(
        pd.DataFrame({date_col: full_dates}),
        how='cross',
    )

    # Total AUM per account and date. dropna=False keeps accounts whose
    # identifying columns contain missing values.
    df_agg = (
        df_temp.groupby(agg_cols, dropna=False)[value_col]
        .sum()
        .reset_index()
    )

    # Project the totals on the full grid; a missing observation means 0 AUM.
    df_full = pd.merge(full_index, df_agg, on=agg_cols, how='left')
    df_full[value_col] = df_full[value_col].fillna(0)
    df_full = df_full.sort_values(agg_cols)

    # Variation and previous value within each account's own timeline.
    per_account = df_full.groupby(key_cols, dropna=False)[value_col]
    aum_diff = per_account.diff().fillna(0)
    prev_value = per_account.shift(1).fillna(0)
    df_full['AUM_diff'] = aum_diff
    df_full['prev_value'] = prev_value

    # Accounts that lose everything on a date.
    df_zero = df_full[(df_full['AUM_diff'] < 0) & (df_full[value_col] == 0)].copy()

    # Accounts that start from zero on a date.
    df_from_zero = df_full[(df_full['AUM_diff'] > 0) & (df_full['prev_value'] == 0)].copy()

    # Pair outflows and inflows on the same date and profile (ID excluded).
    merge_cols = [date_col] + profile_cols
    ruptures = pd.merge(df_zero, df_from_zero, on=merge_cols, suffixes=('_old', '_new'))

    # Relative gap between outflow (negative) and inflow (positive), scaled
    # by their mean magnitude; the denominator is > 0 by construction.
    ruptures['diff_rel'] = abs(ruptures['AUM_diff_old'] + ruptures['AUM_diff_new']) / (
        (abs(ruptures['AUM_diff_old']) + abs(ruptures['AUM_diff_new'])) / 2
    )
    ruptures = ruptures[ruptures['diff_rel'] <= epsilon].drop(columns=['diff_rel'])

    # Final, renamed columns.
    return ruptures[[date_col, id_col + '_old', id_col + '_new', 'AUM_diff_new']].rename(
        columns={
            date_col: 'date',
            id_col + '_old': 'old_account',
            id_col + '_new': 'new_account',
            'AUM_diff_new': 'value',
        }
    )
|
||||
72
.ipynb_checkpoints/function-checkpoint.py
Normal file
72
.ipynb_checkpoints/function-checkpoint.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
def evolution_2_comptes(df, id1, id2):
    """Plot the summed 'Quantity - AUM' over time for two registrar accounts.

    Rows of ``df`` are filtered on 'Registrar Account - ID', the quantity is
    summed per 'Centralisation Date', and both series are drawn on a single
    matplotlib figure which is then shown. Nothing is returned.
    """
    date_col = 'Centralisation Date'
    qty_col = 'Quantity - AUM'

    def daily_totals(account_id):
        # Isolate one account's rows; the copy leaves the caller's frame untouched.
        rows = df[df['Registrar Account - ID'] == account_id].copy()
        rows[date_col] = pd.to_datetime(rows[date_col])
        totals = rows.groupby(date_col)[qty_col].sum()
        return totals.reset_index().sort_values(date_col)

    first = daily_totals(id1)
    second = daily_totals(id2)

    plt.figure(figsize=(12, 6))

    # One curve per account, labelled so the legend can tell them apart.
    curves = (
        (first, f'Account {id1}'),
        (second, f'Account {id2}'),
    )
    for curve, legend_label in curves:
        plt.plot(curve[date_col], curve[qty_col],
                 marker='.', linestyle='-', label=legend_label)

    plt.title("Évolution des AUM pour deux comptes")
    plt.xlabel("Date")
    plt.ylabel("Quantity - AUM")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
|
||||
|
||||
def evolution_3_comptes(df, id1, id2, id3):
    """Plot the summed 'Quantity - AUM' over time for three registrar accounts.

    Rows of ``df`` are filtered on 'Registrar Account - ID', the quantity is
    summed per 'Centralisation Date', and the three series are drawn on a
    single matplotlib figure which is then shown. Nothing is returned.
    """
    date_col = 'Centralisation Date'
    qty_col = 'Quantity - AUM'

    def daily_totals(account_id):
        # Isolate one account's rows; the copy leaves the caller's frame untouched.
        rows = df[df['Registrar Account - ID'] == account_id].copy()
        rows[date_col] = pd.to_datetime(rows[date_col])
        totals = rows.groupby(date_col)[qty_col].sum()
        return totals.reset_index().sort_values(date_col)

    first = daily_totals(id1)
    second = daily_totals(id2)
    third = daily_totals(id3)

    plt.figure(figsize=(12, 6))

    # One curve per account, labelled so the legend can tell them apart.
    curves = (
        (first, f'Account {id1}'),
        (second, f'Account {id2}'),
        (third, f'Account {id3}'),
    )
    for curve, legend_label in curves:
        plt.plot(curve[date_col], curve[qty_col],
                 marker='.', linestyle='-', label=legend_label)

    plt.title("Évolution des AUM pour trois comptes")
    plt.xlabel("Date")
    plt.ylabel("Quantity - AUM")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
|
||||
58
.ipynb_checkpoints/rupture-checkpoint.ipynb
Normal file
58
.ipynb_checkpoints/rupture-checkpoint.ipynb
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "132a1aa1-4cb9-49e7-9f45-c09dd8fd57c1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import s3fs\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"s3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||||
"\n",
|
||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': s3_ENDPOINT_URL})\n",
|
||||
"\n",
|
||||
"BUCKET = \"projet-bdc-data\"\n",
|
||||
"carmignac_path = \"projet-bdc-data/carmignac\"\n",
|
||||
"\n",
|
||||
"# Liste des fichiers AUM\n",
|
||||
"all_files = fs.ls(carmignac_path)\n",
|
||||
"aum_files = [f for f in all_files if \"AUM\" in f and f.endswith(\".csv\")]\n",
|
||||
"print(\"Fichiers AUM :\", aum_files)\n",
|
||||
"\n",
|
||||
"# Lire tous les fichiers dans un dictionnaire\n",
|
||||
"aum_data = {}\n",
|
||||
"for file_path in aum_files:\n",
|
||||
" with fs.open(file_path, 'r') as f:\n",
|
||||
" df = pd.read_csv(f, sep=';',low_memory=False)\n",
|
||||
" aum_data[os.path.basename(file_path)] = df\n",
|
||||
"\n",
|
||||
"df = aum_data['AUM ENSAE V2 -20251105.csv']"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
496
Essai.ipynb
496
Essai.ipynb
File diff suppressed because one or more lines are too long
892
Stats_descriptives.ipynb
Normal file
892
Stats_descriptives.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
__pycache__/detection_rupture.cpython-313.pyc
Normal file
BIN
__pycache__/detection_rupture.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/function.cpython-313.pyc
Normal file
BIN
__pycache__/function.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/functions.cpython-313.pyc
Normal file
BIN
__pycache__/functions.cpython-313.pyc
Normal file
Binary file not shown.
86
detection_rupture.py
Normal file
86
detection_rupture.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
import pandas as pd
|
||||
|
||||
def detect_ruptures(df, epsilon=0.05):
    """Detect "ruptures": AUM that leaves one account and reappears on another.

    The function builds a complete account x date grid (missing observations
    count as 0 AUM), then pairs, on the same date, an account whose AUM drops
    to 0 with an account whose AUM rises from 0, provided both share the same
    identifying columns other than 'Registrar Account - ID'. A pair is kept
    when the outflow and the inflow differ by at most ``epsilon`` relative to
    their mean magnitude.

    Accounts with a missing value in an identifying column are kept (groupby
    would drop them by default) and missing values match each other when
    pairing. Rows without a date are ignored. Nothing can be detected on the
    earliest date because there is no previous value to compare with.

    Args:
        df: DataFrame holding the identifying columns listed below,
            'Centralisation Date' and 'Value - AUM €'. It is not modified.
        epsilon: Maximum relative difference between outflow and inflow.

    Returns:
        DataFrame with columns 'date', 'old_account', 'new_account' and
        'value' (the AUM increase observed on the new account).

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    value_col = 'Value - AUM €'
    date_col = 'Centralisation Date'
    id_col = 'Registrar Account - ID'

    # Columns both sides of a rupture must share (everything but the ID).
    profile_cols = [
        'Agreement - Code',
        'Company - Id',
        'Company - Ultimate Parent Id',
        'Registrar Account - Region',
        'RegistrarAccount - Country',
    ]
    key_cols = profile_cols + [id_col]
    agg_cols = key_cols + [date_col]

    missing = [col for col in agg_cols + [value_col] if col not in df.columns]
    if missing:
        raise KeyError(f"detect_ruptures: missing required column(s): {missing}")

    # Copy only what is needed so the caller's frame is never touched.
    df_temp = df[agg_cols + [value_col]].copy()
    df_temp[date_col] = pd.to_datetime(df_temp[date_col])
    # Undated rows cannot be placed on the timeline; drop them so that NaT
    # never becomes a (spurious) period of the grid.
    df_temp = df_temp.dropna(subset=[date_col])

    # Distinct observation dates, in chronological order.
    full_dates = (
        pd.Series(df_temp[date_col].unique())
        .sort_values()
        .reset_index(drop=True)
    )

    # Every account crossed with every date.
    accounts = df_temp[key_cols].drop_duplicates()
    full_index = accounts.merge(
        pd.DataFrame({date_col: full_dates}),
        how='cross',
    )

    # Total AUM per account and date. dropna=False keeps accounts whose
    # identifying columns contain missing values.
    df_agg = (
        df_temp.groupby(agg_cols, dropna=False)[value_col]
        .sum()
        .reset_index()
    )

    # Project the totals on the full grid; a missing observation means 0 AUM.
    df_full = pd.merge(full_index, df_agg, on=agg_cols, how='left')
    df_full[value_col] = df_full[value_col].fillna(0)
    df_full = df_full.sort_values(agg_cols)

    # Variation and previous value within each account's own timeline.
    per_account = df_full.groupby(key_cols, dropna=False)[value_col]
    aum_diff = per_account.diff().fillna(0)
    prev_value = per_account.shift(1).fillna(0)
    df_full['AUM_diff'] = aum_diff
    df_full['prev_value'] = prev_value

    # Accounts that lose everything on a date.
    df_zero = df_full[(df_full['AUM_diff'] < 0) & (df_full[value_col] == 0)].copy()

    # Accounts that start from zero on a date.
    df_from_zero = df_full[(df_full['AUM_diff'] > 0) & (df_full['prev_value'] == 0)].copy()

    # Pair outflows and inflows on the same date and profile (ID excluded).
    merge_cols = [date_col] + profile_cols
    ruptures = pd.merge(df_zero, df_from_zero, on=merge_cols, suffixes=('_old', '_new'))

    # Relative gap between outflow (negative) and inflow (positive), scaled
    # by their mean magnitude; the denominator is > 0 by construction.
    ruptures['diff_rel'] = abs(ruptures['AUM_diff_old'] + ruptures['AUM_diff_new']) / (
        (abs(ruptures['AUM_diff_old']) + abs(ruptures['AUM_diff_new'])) / 2
    )
    ruptures = ruptures[ruptures['diff_rel'] <= epsilon].drop(columns=['diff_rel'])

    # Final, renamed columns.
    return ruptures[[date_col, id_col + '_old', id_col + '_new', 'AUM_diff_new']].rename(
        columns={
            date_col: 'date',
            id_col + '_old': 'old_account',
            id_col + '_new': 'new_account',
            'AUM_diff_new': 'value',
        }
    )
|
||||
73
function.py
Normal file
73
function.py
Normal file
|
|
@ -0,0 +1,73 @@
|
|||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
def evolution_2_comptes(df, id1, id2):
    """Plot the summed 'Quantity - AUM' over time for two registrar accounts.

    Rows of ``df`` are filtered on 'Registrar Account - ID', the quantity is
    summed per 'Centralisation Date', and both series are drawn on a single
    matplotlib figure which is then shown. Nothing is returned.
    """
    date_col = 'Centralisation Date'
    qty_col = 'Quantity - AUM'

    def daily_totals(account_id):
        # Isolate one account's rows; the copy leaves the caller's frame untouched.
        rows = df[df['Registrar Account - ID'] == account_id].copy()
        rows[date_col] = pd.to_datetime(rows[date_col])
        totals = rows.groupby(date_col)[qty_col].sum()
        return totals.reset_index().sort_values(date_col)

    first = daily_totals(id1)
    second = daily_totals(id2)

    plt.figure(figsize=(12, 6))

    # One curve per account, labelled so the legend can tell them apart.
    curves = (
        (first, f'Account {id1}'),
        (second, f'Account {id2}'),
    )
    for curve, legend_label in curves:
        plt.plot(curve[date_col], curve[qty_col],
                 marker='.', linestyle='-', label=legend_label)

    plt.title("Évolution des AUM pour deux comptes")
    plt.xlabel("Date")
    plt.ylabel("Quantity - AUM")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
|
||||
|
||||
|
||||
def evolution_3_comptes(df, id1, id2, id3):
    """Plot the summed 'Quantity - AUM' over time for three registrar accounts.

    Rows of ``df`` are filtered on 'Registrar Account - ID', the quantity is
    summed per 'Centralisation Date', and the three series are drawn on a
    single matplotlib figure which is then shown. Nothing is returned.
    """
    date_col = 'Centralisation Date'
    qty_col = 'Quantity - AUM'

    def daily_totals(account_id):
        # Isolate one account's rows; the copy leaves the caller's frame untouched.
        rows = df[df['Registrar Account - ID'] == account_id].copy()
        rows[date_col] = pd.to_datetime(rows[date_col])
        totals = rows.groupby(date_col)[qty_col].sum()
        return totals.reset_index().sort_values(date_col)

    first = daily_totals(id1)
    second = daily_totals(id2)
    third = daily_totals(id3)

    plt.figure(figsize=(12, 6))

    # One curve per account, labelled so the legend can tell them apart.
    curves = (
        (first, f'Account {id1}'),
        (second, f'Account {id2}'),
        (third, f'Account {id3}'),
    )
    for curve, legend_label in curves:
        plt.plot(curve[date_col], curve[qty_col],
                 marker='.', linestyle='-', label=legend_label)

    plt.title("Évolution des AUM pour trois comptes")
    plt.xlabel("Date")
    plt.ylabel("Quantity - AUM")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()
|
||||
124
rupture.ipynb
Normal file
124
rupture.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user