Project_Carmignac/data_exploration/explore.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bd938e6e",
   "metadata": {},
   "source": [
    "**Short notebook to test connectivity with S3 services and explore the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "127753ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ae3c64fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import s3fs\n",
    "# Open a handle on the S3-compatible endpoint. Credentials are read from the\n",
    "# environment, so a missing variable raises KeyError at this point.\n",
    "fs = s3fs.S3FileSystem(\n",
    "    client_kwargs=dict(endpoint_url='https://minio-simple.lab.groupe-genes.fr'),\n",
    "    key=os.environ['AWS_ACCESS_KEY_ID'],\n",
    "    secret=os.environ['AWS_SECRET_ACCESS_KEY'],\n",
    "    token=os.environ['AWS_SESSION_TOKEN'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "84b9ac42",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def sample_by_blocks(df, block_size=10, num_blocks=10, random_state=None):\n",
    "    \"\"\"Sample num_blocks non-overlapping blocks of block_size consecutive rows.\n",
    "\n",
    "    Block positions are drawn uniformly among all placements of num_blocks\n",
    "    disjoint blocks; blocks are returned in their original row order.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Frame to sample from; rows are addressed by position, not by label.\n",
    "    block_size : int, default 10\n",
    "        Number of consecutive rows in each block.\n",
    "    num_blocks : int, default 10\n",
    "        Number of blocks to draw.\n",
    "    random_state : int, numpy.random.Generator or None, default None\n",
    "        Seed (or generator) forwarded to np.random.default_rng.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        num_blocks * block_size rows, re-indexed from 0.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If block_size or num_blocks is negative, or if df has fewer than\n",
    "        num_blocks * block_size rows.\n",
    "    \"\"\"\n",
    "    if block_size < 0 or num_blocks < 0:\n",
    "        raise ValueError(\"block_size and num_blocks must be non-negative\")\n",
    "    n = len(df)\n",
    "    if n < block_size:\n",
    "        raise ValueError(f\"DataFrame has {n} rows, need at least {block_size}\")\n",
    "    # Rows left over once every block is placed; they become the gaps between blocks.\n",
    "    slack = n - num_blocks * block_size\n",
    "    if slack < 0:\n",
    "        raise ValueError(f\"Not enough room for {num_blocks} non-overlapping blocks (need at least {num_blocks * block_size} rows)\")\n",
    "    rng = np.random.default_rng(random_state)\n",
    "    # Pick sorted distinct slots in a compressed space where each block takes one\n",
    "    # position, then shift the i-th slot by i * (block_size - 1). Consecutive starts\n",
    "    # end up at least block_size apart, so blocks can never overlap.\n",
    "    slots = np.sort(rng.choice(slack + num_blocks, size=num_blocks, replace=False))\n",
    "    starts = slots + np.arange(num_blocks) * (block_size - 1)\n",
    "    # Broadcasting expands each start into its run of row positions and also\n",
    "    # yields an empty index array when num_blocks is 0.\n",
    "    indices = (starts[:, None] + np.arange(block_size)).ravel()\n",
    "    return df.iloc[indices].reset_index(drop=True)\n",
    "\n",
    "# sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "83472648",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_10061/1081306672.py:2: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(f, sep=\";\")\n"
     ]
    }
   ],
   "source": [
    "# Read the semicolon-separated CSV straight from S3.\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole file\n",
    "# rather than chunk by chunk, which avoids the mixed-type DtypeWarning.\n",
    "with fs.open('s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
    "    df = pd.read_csv(f, sep=\";\", low_memory=False)\n",
    "\n",
    "# Keep a small seeded sample: 10 blocks of 10 consecutive rows.\n",
    "sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "36ec4312",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the sample to a CSV in the current working directory, without the index column.\n",
    "sample_df.to_csv(path_or_buf='aum_sample.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}