118 lines
3.2 KiB
Plaintext
118 lines
3.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bd938e6e",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Short notebook to test connectivity with S3 services and explore the data**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "127753ac",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "ae3c64fe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import s3fs\n",
|
"# S3 filesystem handle for the MinIO endpoint. All three credentials are read from\n",
"# the environment; os.environ[...] raises KeyError if any of them is missing.\n",
"fs = s3fs.S3FileSystem(\n",
"    key=os.environ[\"AWS_ACCESS_KEY_ID\"],\n",
"    secret=os.environ[\"AWS_SECRET_ACCESS_KEY\"],\n",
"    token=os.environ[\"AWS_SESSION_TOKEN\"],\n",
"    client_kwargs=dict(endpoint_url='https://' + 'minio-simple.lab.groupe-genes.fr'),\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "84b9ac42",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"\n",
|
"def sample_by_blocks(df, block_size=10, num_blocks=10, random_state=None):\n",
"    \"\"\"Sample num_blocks non-overlapping blocks of block_size consecutive rows.\n",
"\n",
"    Block layouts are drawn uniformly among all placements of num_blocks\n",
"    disjoint blocks, and the blocks are returned in their original row order.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Frame to sample from; rows are addressed by position (iloc).\n",
"    block_size : int\n",
"        Number of consecutive rows per block.\n",
"    num_blocks : int\n",
"        Number of disjoint blocks to draw.\n",
"    random_state : None, int, or anything accepted by numpy.random.default_rng\n",
"        Seed for reproducible sampling.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        The num_blocks * block_size sampled rows, re-indexed from 0.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If df has fewer than block_size rows, or fewer than\n",
"        num_blocks * block_size rows.\n",
"    \"\"\"\n",
"    n = len(df)\n",
"    if n < block_size:\n",
"        raise ValueError(f\"DataFrame has {n} rows, need at least {block_size}\")\n",
"    required = num_blocks * block_size\n",
"    if n < required:\n",
"        raise ValueError(f\"Not enough room for {num_blocks} non-overlapping blocks (need at least {required} rows)\")\n",
"    rng = np.random.default_rng(random_state)\n",
"    # Distinct start rows alone do not prevent overlap (starts 3 and 5 with\n",
"    # block_size=10 share rows). Instead pick num_blocks distinct slots among the\n",
"    # spare rows plus one slot per block, then shift the i-th sorted slot by\n",
"    # i * (block_size - 1): consecutive starts end up at least block_size apart.\n",
"    slots = np.sort(rng.choice(n - required + num_blocks, size=num_blocks, replace=False))\n",
"    starts = slots + np.arange(num_blocks) * (block_size - 1)\n",
"    # Broadcasting expands every start into its block_size consecutive row positions.\n",
"    indices = (starts[:, None] + np.arange(block_size)).ravel()\n",
"    return df.iloc[indices].reset_index(drop=True)\n",
|
|
"\n",
|
|
"# sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "83472648",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_10061/1081306672.py:2: DtypeWarning: Columns (0,1,2,3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
" df = pd.read_csv(f, sep=\";\")\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Read the CSV directly from S3-compatible storage through the s3fs handle.\n",
"with fs.open('s3://projet-bdc-data/carmignac/AUM ENSAE V2 -20251105.csv', 'rb') as f:\n",
"    # low_memory=False makes pandas infer each column's dtype from the whole file\n",
"    # instead of chunk by chunk, avoiding the mixed-type DtypeWarning on columns 0-3.\n",
"    df = pd.read_csv(f, sep=\";\", low_memory=False)\n",
"\n",
"# Seeded draw of 10 blocks of 10 consecutive rows, so the sample is reproducible.\n",
"sample_df = sample_by_blocks(df, block_size=10, num_blocks=10, random_state=42)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "36ec4312",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Write the sample to the working directory; index=False leaves the row index out of the file.\n",
"sample_df.to_csv(path_or_buf='aum_sample.csv', index=False)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|