Project_Carmignac/data_exploration/explore.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bd938e6e",
   "metadata": {},
   "source": [
    "**Short notebook to test connectivity with S3 services and explore the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "127753ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ae3c64fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import s3fs\n",
    "fs = s3fs.S3FileSystem(\n",
    "    client_kwargs={'endpoint_url': 'https://'+'minio-simple.lab.groupe-genes.fr'},\n",
    "    key = os.environ[\"AWS_ACCESS_KEY_ID\"], \n",
    "    secret = os.environ[\"AWS_SECRET_ACCESS_KEY\"], \n",
    "    token = os.environ[\"AWS_SESSION_TOKEN\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "84b9ac42",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def sample_by_blocks(df, block_size=10, num_blocks=10, random_state=None):\n",
    "    \"\"\"Sample num_blocks blocks of block_size consecutive rows (no overlapping blocks).\"\"\"\n",
    "    n = len(df)\n",
    "    max_start = n - block_size\n",
    "    if max_start < 0:\n",
    "        raise ValueError(f\"DataFrame has {n} rows, need at least {block_size}\")\n",
    "    if max_start + 1 < num_blocks:\n",
    "        raise ValueError(f\"Not enough room for {num_blocks} non-overlapping blocks (need at least {num_blocks * block_size} rows)\")\n",
    "    rng = np.random.default_rng(random_state)\n",
    "    chosen_starts = rng.choice(max_start + 1, size=num_blocks, replace=False)\n",
    "    chosen_starts.sort()  # blocks in order of position in original df\n",
    "    indices = np.concatenate([np.arange(s, s + block_size) for s in chosen_starts])\n",
    "    return df.iloc[indices].reset_index(drop=True)\n",
    "\n",
    "# sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "83472648",
   "metadata": {},
   "outputs": [],
   "source": [
    "with fs.open('s3://projet-bdc-data/carmignac/Data Modélisation/market data/esterRates.csv', 'rb') as f:\n",
    "    df = pd.read_csv(f, sep =\";\")\n",
    "\n",
    "sample_df = df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "79af063e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Yld to Maturity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>31/12/2014</td>\n",
       "      <td>0.144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>02/01/2015</td>\n",
       "      <td>-0.079</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>05/01/2015</td>\n",
       "      <td>-0.074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>06/01/2015</td>\n",
       "      <td>-0.075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>07/01/2015</td>\n",
       "      <td>-0.069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2821</th>\n",
       "      <td>16/10/2025</td>\n",
       "      <td>1.928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2822</th>\n",
       "      <td>17/10/2025</td>\n",
       "      <td>1.928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2823</th>\n",
       "      <td>20/10/2025</td>\n",
       "      <td>1.928</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2824</th>\n",
       "      <td>21/10/2025</td>\n",
       "      <td>1.927</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2825</th>\n",
       "      <td>22/10/2025</td>\n",
       "      <td>1.928</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2826 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            Date  Yld to Maturity\n",
       "0     31/12/2014            0.144\n",
       "1     02/01/2015           -0.079\n",
       "2     05/01/2015           -0.074\n",
       "3     06/01/2015           -0.075\n",
       "4     07/01/2015           -0.069\n",
       "...          ...              ...\n",
       "2821  16/10/2025            1.928\n",
       "2822  17/10/2025            1.928\n",
       "2823  20/10/2025            1.928\n",
       "2824  21/10/2025            1.927\n",
       "2825  22/10/2025            1.928\n",
       "\n",
       "[2826 rows x 2 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "36ec4312",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_df.to_csv('str_Rates.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}