Project_Carmignac/clustering/data_loader.py

41 lines
1.4 KiB
Python
Raw Normal View History

2026-02-02 11:37:16 +01:00
import pandas as pd
2026-02-02 19:00:03 +01:00
import os
import s3fs
# Shared S3 filesystem handle used by the loaders in this module.
# The three credentials are looked up in the environment when the module is
# imported, so a missing variable raises KeyError at import time.
fs = s3fs.S3FileSystem(
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    token=os.environ["AWS_SESSION_TOKEN"],
    client_kwargs={"endpoint_url": "https://minio-simple.lab.groupe-genes.fr"},
)
2026-02-02 11:37:16 +01:00
2026-02-02 19:00:03 +01:00
def _parse_date_column(frame, column="Date"):
    """Convert ``frame[column]`` to datetimes in place, trying day-first first.

    The column is first parsed with ``dayfirst=True``. If pandas rejects the
    values under that setting, the column is parsed again with pandas'
    default settings. Only parsing failures trigger the retry; anything else
    (for example a missing column or a keyboard interrupt) propagates.

    Args:
        frame: DataFrame holding the column to convert.
        column: Name of the column to convert.

    Returns:
        The same DataFrame, with ``column`` replaced by its parsed values.
    """
    try:
        frame[column] = pd.to_datetime(frame[column], dayfirst=True)
    except (ValueError, TypeError, OverflowError):
        # Day-first parsing failed; retry with pandas' default settings.
        frame[column] = pd.to_datetime(frame[column])
    return frame


def load_and_clean_data(rates_path, gov_path):
    """Load the AUM, flows, rates and government datasets as DataFrames.

    The AUM and flows tables are read from fixed CSV objects in the
    ``projet-bdc-carmignac-g3`` bucket through the module-level ``fs``
    handle. The two market tables are read from the paths given by the
    caller.

    Args:
        rates_path: Location of the rates CSV, passed to ``pd.read_csv``.
        gov_path: Location of the government CSV, passed to ``pd.read_csv``.

    Returns:
        A tuple ``(flows, aum, rates, gov)`` of DataFrames. The
        ``'Centralisation Date'`` column of ``flows`` and ``aum`` and the
        ``'Date'`` column of ``rates`` and ``gov`` are converted with
        ``pd.to_datetime``.

    Raises:
        KeyError: If an expected date column is missing from a table.
        ValueError: If a date column cannot be parsed, including after the
            day-first retry for the market tables.
    """
    # Enforce string types for IDs to prevent 'Mixed Type' warnings
    dtype_spec = {
        'Registrar Account - ID': str,
        'Company - Id': str,
        'Company - Ultimate Parent Id': str,
        'Agreement - Code': str,
    }

    with fs.open('s3://projet-bdc-carmignac-g3/AUM_repaired.csv', 'rb') as f:
        aum = pd.read_csv(f, sep=",", dtype=dtype_spec)
    with fs.open('s3://projet-bdc-carmignac-g3/flows.csv', 'rb') as f:
        flows = pd.read_csv(f, sep=",", dtype=dtype_spec)

    flows['Centralisation Date'] = pd.to_datetime(flows['Centralisation Date'])
    aum['Centralisation Date'] = pd.to_datetime(aum['Centralisation Date'])

    # Market data loading (Standardizing dates)
    print("Loading Market Data...")
    rates = _parse_date_column(pd.read_csv(rates_path))
    gov = _parse_date_column(pd.read_csv(gov_path))

    return flows, aum, rates, gov