Project_Carmignac/data_exploration/explore.ipynb
2026-02-02 11:31:08 +00:00

227 lines
6.2 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "bd938e6e",
"metadata": {},
"source": [
"**Short notebook to test connectivity with S3 services and explore the data**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "127753ac",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ae3c64fe",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"\n",
"# Endpoint of the S3-compatible object store used by this project.\n",
"S3_ENDPOINT_URL = 'https://minio-simple.lab.groupe-genes.fr'\n",
"\n",
"# Credentials are read from the environment, never written into the notebook.\n",
"# The access key and secret are mandatory (KeyError if missing). The session\n",
"# token only exists for temporary credentials, so it is read with .get() and\n",
"# is passed as None when the variable is not set.\n",
"fs = s3fs.S3FileSystem(\n",
"    client_kwargs={'endpoint_url': S3_ENDPOINT_URL},\n",
"    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
"    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
"    token=os.environ.get(\"AWS_SESSION_TOKEN\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "84b9ac42",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"\n",
"def sample_by_blocks(df, block_size=10, num_blocks=10, random_state=None):\n",
"    \"\"\"Sample ``num_blocks`` non-overlapping blocks of ``block_size`` consecutive rows.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to sample from; rows are addressed by position (``iloc``).\n",
"    block_size : int, default 10\n",
"        Number of consecutive rows in each block (must be >= 1).\n",
"    num_blocks : int, default 10\n",
"        Number of blocks to draw (must be >= 1).\n",
"    random_state : optional\n",
"        Seed (or generator) forwarded to ``numpy.random.default_rng``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        ``num_blocks * block_size`` rows, blocks kept in their original order,\n",
"        with a fresh 0..N-1 index.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If ``block_size`` or ``num_blocks`` is not positive, or if ``df`` has\n",
"        fewer than ``num_blocks * block_size`` rows.\n",
"\n",
"    Example\n",
"    -------\n",
"    sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)\n",
"    \"\"\"\n",
"    if block_size < 1 or num_blocks < 1:\n",
"        raise ValueError(\"block_size and num_blocks must be positive integers\")\n",
"    n = len(df)\n",
"    if n < block_size:\n",
"        raise ValueError(f\"DataFrame has {n} rows, need at least {block_size}\")\n",
"    # Rows left over once every block has been placed; negative means no fit.\n",
"    slack = n - num_blocks * block_size\n",
"    if slack < 0:\n",
"        raise ValueError(f\"Not enough room for {num_blocks} non-overlapping blocks (need at least {num_blocks * block_size} rows)\")\n",
"    rng = np.random.default_rng(random_state)\n",
"    # Distinct start positions alone do not prevent overlap (e.g. starts 3 and 5\n",
"    # with block_size 10). Draw num_blocks distinct slots out of\n",
"    # slack + num_blocks instead, then shift the i-th sorted slot by\n",
"    # i * (block_size - 1): consecutive starts end up at least block_size apart\n",
"    # and the last block still ends inside the frame.\n",
"    slots = np.sort(rng.choice(slack + num_blocks, size=num_blocks, replace=False))\n",
"    starts = slots + np.arange(num_blocks) * (block_size - 1)\n",
"    # Expand every start into its run of block_size consecutive positions.\n",
"    indices = (starts[:, None] + np.arange(block_size)).ravel()\n",
"    return df.iloc[indices].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "83472648",
"metadata": {},
"outputs": [],
"source": [
"# Stream esterRates.csv from the bucket through the s3fs handle and parse it\n",
"# as ';'-separated text.\n",
"with fs.open('s3://projet-bdc-data/carmignac/Data Modélisation/market data/esterRates.csv', 'rb') as remote_file:\n",
"    df = pd.read_csv(remote_file, sep=';')\n",
"\n",
"# The whole frame is kept (no block sampling is applied here); the cells below\n",
"# read it under the name sample_df.\n",
"sample_df = df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "79af063e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Date</th>\n",
" <th>Yld to Maturity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>31/12/2014</td>\n",
" <td>0.144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>02/01/2015</td>\n",
" <td>-0.079</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>05/01/2015</td>\n",
" <td>-0.074</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>06/01/2015</td>\n",
" <td>-0.075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>07/01/2015</td>\n",
" <td>-0.069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2821</th>\n",
" <td>16/10/2025</td>\n",
" <td>1.928</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2822</th>\n",
" <td>17/10/2025</td>\n",
" <td>1.928</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2823</th>\n",
" <td>20/10/2025</td>\n",
" <td>1.928</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2824</th>\n",
" <td>21/10/2025</td>\n",
" <td>1.927</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2825</th>\n",
" <td>22/10/2025</td>\n",
" <td>1.928</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2826 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" Date Yld to Maturity\n",
"0 31/12/2014 0.144\n",
"1 02/01/2015 -0.079\n",
"2 05/01/2015 -0.074\n",
"3 06/01/2015 -0.075\n",
"4 07/01/2015 -0.069\n",
"... ... ...\n",
"2821 16/10/2025 1.928\n",
"2822 17/10/2025 1.928\n",
"2823 20/10/2025 1.928\n",
"2824 21/10/2025 1.927\n",
"2825 22/10/2025 1.928\n",
"\n",
"[2826 rows x 2 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bare last expression so the notebook renders the frame as the cell result;\n",
"# the saved output shows only the first and last rows of the frame.\n",
"sample_df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "36ec4312",
"metadata": {},
"outputs": [],
"source": [
"# Write the frame to a CSV file in the current working directory, leaving out\n",
"# the row index.\n",
"sample_df.to_csv(path_or_buf='str_Rates.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}