From 41f49edd1c346884fd97672a125c7df1fa25d550 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Wed, 6 Mar 2024 11:49:51 +0000 Subject: [PATCH] explore sport --- Sport/exploration_sport.ipynb | 119 ++++++++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 6 deletions(-) diff --git a/Sport/exploration_sport.ipynb b/Sport/exploration_sport.ipynb index b60be94..bf66eaf 100644 --- a/Sport/exploration_sport.ipynb +++ b/Sport/exploration_sport.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 31, "id": "f62b996c-4e17-40ea-83ba-f0cb60be7671", "metadata": {}, "outputs": [ @@ -54,7 +54,7 @@ " 'bdc2324-data/9']" ] }, - "execution_count": 3, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -831,23 +831,130 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "id": "970302f5-4de2-46b4-a1ce-a5396f5330ab", "metadata": {}, + "outputs": [], + "source": [ + "def display_databases(directory_path, file_name):\n", + " \"\"\"\n", + " This function returns the file from s3 storage \n", + " \"\"\"\n", + " file_path = \"projet-bdc2324-team1\" + \"/Generalization/\" + directory_path + \"/\" + file_name + \".csv\"\n", + " print(\"File path : \", file_path)\n", + " with fs.open(file_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in, sep=\",\") \n", + " return df " + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "f5bfae82-04aa-44e1-9869-3f4fd5736b41", + "metadata": { + "scrolled": true + }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/Generalization/sport/Train_set.csv\n" + ] + }, { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c
\n", + "
" + ], "text/plain": [ - "" + "Empty DataFrame\n", + "Columns: [c]\n", + "Index: []" ] }, - "execution_count": 5, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "fs(" + "train_sport = display_databases('sport', 'Train_set')\n", + "train_sport.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "56d5b12e-45e8-4312-869d-bde4d24900b6", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'y_has_purchased'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/pandas/core/indexes/base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:153\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:182\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'y_has_purchased'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrain_sport\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43my_has_purchased\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39munique()\n", + "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/pandas/core/indexes/base.py:3809\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3805\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3807\u001b[0m ):\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3809\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3810\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3812\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'y_has_purchased'" + ] + } + ], + "source": [ + "train_sport['y_has_purchased'].unique()" + ] + }, + { + "cell_type": "raw", + "id": "bd8019ae-8d7b-4dfe-be93-abf80a497e13", + "metadata": {}, + "source": [ + "projet-bdc2324-team1/Generalization/sport/Train_set/dataset_train5.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d056c7b3-0e8c-485c-b2f3-4681077f1c2e", + "metadata": {}, + "outputs": [], + "source": [ + "fs.ls('projet-bdc2324-team1/Generalization/sport')" ] } ],