Project_Carmignac/clustering/data_loader.py

32 lines
1.1 KiB
Python

import pandas as pd
import os
import s3fs
# Module-level S3 filesystem handle used by the loaders in this file.
# Credentials are read from the environment when the module is imported, so a
# missing AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN raises
# KeyError at import time.
fs = s3fs.S3FileSystem(
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
    token=os.environ["AWS_SESSION_TOKEN"],
    client_kwargs=dict(endpoint_url="https://minio-simple.lab.groupe-genes.fr"),
)
def load_and_clean_data(
    rates_path,
    gov_path,
    flows_path='s3://projet-bdc-carmignac-g3/flows.csv',
    aum_path='s3://projet-bdc-carmignac-g3/AUM_repaired.csv',
):
    """Load the flows, AUM, rates and gov CSVs and parse their date columns.

    The flows and AUM tables are read through the module-level ``fs`` S3
    filesystem; the rates and gov tables are read directly with
    ``pd.read_csv`` from the paths supplied by the caller.

    Args:
        rates_path: Location of the rates CSV, passed to ``pd.read_csv``.
            Must contain a ``Date`` column.
        gov_path: Location of the gov CSV, passed to ``pd.read_csv``.
            Must contain a ``Date`` column.
        flows_path: S3 path of the flows CSV (opened via ``fs``). Must
            contain a ``Centralisation Date`` column.
        aum_path: S3 path of the AUM CSV (opened via ``fs``). Must contain
            a ``Centralisation Date`` column.

    Returns:
        A tuple ``(flows, aum, rates, gov)`` of DataFrames whose date
        columns have been converted with ``pd.to_datetime``.

    Raises:
        KeyError: If an expected date column is missing from a table.
        ValueError: If a date column cannot be parsed.
    """
    with fs.open(aum_path, 'rb') as f:
        aum = pd.read_csv(f, sep=",")
    with fs.open(flows_path, 'rb') as f:
        flows = pd.read_csv(f, sep=",")

    flows['Centralisation Date'] = pd.to_datetime(flows['Centralisation Date'])
    aum['Centralisation Date'] = pd.to_datetime(aum['Centralisation Date'])

    rates = pd.read_csv(rates_path)
    # Try day-first parsing first and fall back to pandas' default inference
    # if that fails. Only parse failures trigger the fallback, so interrupts
    # and unrelated errors propagate instead of being silently swallowed.
    try:
        rates['Date'] = pd.to_datetime(rates['Date'], dayfirst=True)
    except (ValueError, TypeError):
        rates['Date'] = pd.to_datetime(rates['Date'])

    gov = pd.read_csv(gov_path)
    gov['Date'] = pd.to_datetime(gov['Date'])

    return flows, aum, rates, gov