Project_Carmignac/data_exploration/explore.ipynb
2026-02-02 09:24:49 +00:00

118 lines
3.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "bd938e6e",
"metadata": {},
"source": [
"**Short notebook to test connectivity with S3 services and explore the data**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "127753ac",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ae3c64fe",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"# Credentials are read from the environment so no secret is stored in the notebook;\n",
"# a missing variable fails fast with a KeyError.\n",
"fs = s3fs.S3FileSystem(\n",
"    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
"    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
"    token=os.environ[\"AWS_SESSION_TOKEN\"],\n",
"    client_kwargs=dict(endpoint_url='https://minio-simple.lab.groupe-genes.fr'),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "84b9ac42",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def sample_by_blocks(df, block_size=10, num_blocks=10, random_state=None):\n",
"    \"\"\"Sample ``num_blocks`` non-overlapping blocks of ``block_size`` consecutive rows.\n",
"\n",
"    Block positions are drawn uniformly among all placements in which no two\n",
"    blocks share a row, so a source row never appears twice in the result.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to sample from; rows are addressed by position, not by label.\n",
"    block_size : int, default 10\n",
"        Number of consecutive rows in each block.\n",
"    num_blocks : int, default 10\n",
"        Number of blocks to draw.\n",
"    random_state : optional\n",
"        Seed passed to ``numpy.random.default_rng`` for reproducible sampling.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        ``num_blocks * block_size`` rows, blocks kept in their original order,\n",
"        with a fresh 0..N-1 index.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If a size is negative, or the frame has fewer than\n",
"        ``num_blocks * block_size`` rows.\n",
"    \"\"\"\n",
"    if block_size < 0 or num_blocks < 0:\n",
"        raise ValueError(\"block_size and num_blocks must be non-negative\")\n",
"    n = len(df)\n",
"    if n < block_size:\n",
"        raise ValueError(f\"DataFrame has {n} rows, need at least {block_size}\")\n",
"    # Rows left over once every block has been placed.\n",
"    slack = n - num_blocks * block_size\n",
"    if slack < 0:\n",
"        raise ValueError(f\"Not enough room for {num_blocks} non-overlapping blocks (need at least {num_blocks * block_size} rows)\")\n",
"    rng = np.random.default_rng(random_state)\n",
"    # Drawing distinct start rows is not enough: two starts closer than\n",
"    # block_size would give overlapping blocks. Instead choose which of the\n",
"    # (slack + num_blocks) slots hold a block, then widen each chosen slot\n",
"    # back to block_size rows; consecutive starts are then >= block_size apart.\n",
"    slots = np.sort(rng.choice(slack + num_blocks, size=num_blocks, replace=False))\n",
"    starts = slots + np.arange(num_blocks) * (block_size - 1)\n",
"    indices = (starts[:, None] + np.arange(block_size)).ravel()\n",
"    return df.iloc[indices].reset_index(drop=True)\n",
"\n",
"# sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "83472648",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_10061/1081306672.py:2: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(f, sep=\";\")\n"
]
}
],
"source": [
"# low_memory=False makes pandas infer each column's dtype from the whole file;\n",
"# the default chunked parsing raised a DtypeWarning (mixed types in columns 0-3).\n",
"with fs.open('s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
"    df = pd.read_csv(f, sep=\";\", low_memory=False)\n",
"\n",
"# Fixed seed so the extracted sample is reproducible across runs.\n",
"sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "36ec4312",
"metadata": {},
"outputs": [],
"source": [
"# Write the sampled rows to a CSV in the kernel's working directory, without the index column.\n",
"sample_df.to_csv(path_or_buf='aum_sample.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}