From fb586b5348c51ae25039b21fac8eab73db52fac7 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Sat, 13 Jan 2024 10:32:40 +0000 Subject: [PATCH] process df --- Notebook_AR.ipynb | 387 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 370 insertions(+), 17 deletions(-) diff --git a/Notebook_AR.ipynb b/Notebook_AR.ipynb index 3371cf3..9de5bc6 100644 --- a/Notebook_AR.ipynb +++ b/Notebook_AR.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "30494c5e-9649-4fff-8708-617544188b20", "metadata": {}, "outputs": [ @@ -46,7 +46,7 @@ " 'bdc2324-data/9']" ] }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "f1cce705-46e1-42de-8e93-2ee15312d288", "metadata": {}, "outputs": [], @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c", "metadata": {}, "outputs": [ @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "65cb38ad-52ae-4266-85d8-c47d81b00283", "metadata": {}, "outputs": [], @@ -715,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 10, "id": "590a132a-4f57-4ea3-a282-2ef913e4b753", "metadata": {}, "outputs": [], @@ -725,7 +725,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 11, "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb", "metadata": {}, "outputs": [], @@ -735,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 12, "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2", "metadata": {}, "outputs": [], @@ -750,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 13, "id": "2c478213-09ae-44ef-8c7c-125bcb571642", "metadata": {}, "outputs": [], @@ -768,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 14, "id": "327e44b0-eb99-4022-b4ca-79548072f0f0", "metadata": {}, "outputs": [], @@ -781,6 +781,22 @@ " return percent_missing" ] }, + { + "cell_type": "code", + "execution_count": 25, + "id": "10926def-267f-4e86-b2c9-72e27ff9a9df", + "metadata": {}, + "outputs": [], + "source": [ + "def process_df(df):\n", + " df = remove_horodates(df)\n", + " print(\"Number of columns : \", len(df.columns))\n", + " df = order_columns_id(df)\n", + " print(\"Columns : \", df.columns)\n", + " print(\"Percent of NA for each column : \", percent_na(df))\n", + " return df" + ] + }, { "cell_type": "markdown", "id": "98ac02cb-5295-47ca-99c6-99e622c5f388", @@ -791,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 15, "id": "862a7658-0602-4d94-bb58-d23774c00d32", "metadata": {}, "outputs": [ @@ -961,7 +977,7 @@ "4 NaN f1c4689bc47dee6f60b56d74b593dd46 " ] }, - "execution_count": 32, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -974,7 +990,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28", "metadata": {}, "outputs": [ @@ -1127,7 +1143,7 @@ "4 8.5 False 0.0 NaN NaN " ] }, - "execution_count": 33, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1142,7 +1158,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 17, "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f", "metadata": {}, "outputs": [ @@ -1172,7 +1188,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 18, "id": "460749ac-aa26-4216-8667-518546f72f72", "metadata": {}, "outputs": [ @@ -1200,6 +1216,343 @@ "percent_missing = products.isna().sum() * 100 / len(products)\n", "print(percent_missing)" ] + }, + { + "cell_type": "markdown", + "id": "ebcb48ab-adad-42e5-b5d7-7275771cd200", + "metadata": {}, + "source": [ + "#### Deep analysis of categories.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c", + "metadata": {}, + "outputs": [], + "source": [ + "name_dataset = '1categories.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : bdc2324-data/1/1categories.csv\n", + "Shape : (27, 7)\n", + "Number of columns : 7\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldquotaidentifier
030en nb entrées gr2020-09-03 13:21:20.019202+02:002020-09-03 13:21:20.019202+02:00NaNNaN849ab2791a14f5fc2bb4d87ab2b78bf6
116indiv activité enfant2020-09-03 13:11:23.306968+02:002020-09-03 13:11:23.306968+02:00NaNNaN425fd2f01984cc4ba030c1be98f42c33
239indiv activité gr2020-09-03 13:21:20.029901+02:002020-09-03 13:21:20.029901+02:00NaNNaN9244dd3738788db0d22a5d0afe687b69
31108groupe forfait adulte2020-09-19 02:06:43.145697+02:002020-09-19 02:06:43.145697+02:00NaNNaN3edda20c877a93b5ff883827238eb711
46groupe forfait entrées tr2020-09-03 13:11:23.264997+02:002020-09-03 13:11:23.264997+02:00NaNNaNff48df4b2dd5a14116bf4d280b31621e
\n", + "
" + ], + "text/plain": [ + " id name created_at \\\n", + "0 30 en nb entrées gr 2020-09-03 13:21:20.019202+02:00 \n", + "1 16 indiv activité enfant 2020-09-03 13:11:23.306968+02:00 \n", + "2 39 indiv activité gr 2020-09-03 13:21:20.029901+02:00 \n", + "3 1108 groupe forfait adulte 2020-09-19 02:06:43.145697+02:00 \n", + "4 6 groupe forfait entrées tr 2020-09-03 13:11:23.264997+02:00 \n", + "\n", + " updated_at extra_field quota \\\n", + "0 2020-09-03 13:21:20.019202+02:00 NaN NaN \n", + "1 2020-09-03 13:11:23.306968+02:00 NaN NaN \n", + "2 2020-09-03 13:21:20.029901+02:00 NaN NaN \n", + "3 2020-09-19 02:06:43.145697+02:00 NaN NaN \n", + "4 2020-09-03 13:11:23.264997+02:00 NaN NaN \n", + "\n", + " identifier \n", + "0 849ab2791a14f5fc2bb4d87ab2b78bf6 \n", + "1 425fd2f01984cc4ba030c1be98f42c33 \n", + "2 9244dd3738788db0d22a5d0afe687b69 \n", + "3 3edda20c877a93b5ff883827238eb711 \n", + "4 ff48df4b2dd5a14116bf4d280b31621e " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = display_databases(name_dataset)\n", + "print(\"Number of columns : \", len(df.columns))\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of columns : 5\n", + "Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n", + "Percent of NA for each column : id 0.000000\n", + "identifier 0.000000\n", + "name 3.703704\n", + "extra_field 100.000000\n", + "quota 100.000000\n", + "dtype: float64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ididentifiernameextra_fieldquota
030849ab2791a14f5fc2bb4d87ab2b78bf6en nb entrées grNaNNaN
116425fd2f01984cc4ba030c1be98f42c33indiv activité enfantNaNNaN
2399244dd3738788db0d22a5d0afe687b69indiv activité grNaNNaN
311083edda20c877a93b5ff883827238eb711groupe forfait adulteNaNNaN
46ff48df4b2dd5a14116bf4d280b31621egroupe forfait entrées trNaNNaN
\n", + "
" + ], + "text/plain": [ + " id identifier name \\\n", + "0 30 849ab2791a14f5fc2bb4d87ab2b78bf6 en nb entrées gr \n", + "1 16 425fd2f01984cc4ba030c1be98f42c33 indiv activité enfant \n", + "2 39 9244dd3738788db0d22a5d0afe687b69 indiv activité gr \n", + "3 1108 3edda20c877a93b5ff883827238eb711 groupe forfait adulte \n", + "4 6 ff48df4b2dd5a14116bf4d280b31621e groupe forfait entrées tr \n", + "\n", + " extra_field quota \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = process_df(df)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "identifier object\n", + "name object\n", + "extra_field float64\n", + "quota float64\n", + "dtype: object" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "c4cb0b37-2262-45c0-97be-b12c503016e3", + "metadata": {}, + "source": [ + "#### Deep analysis of type_of_categories.csv" + ] + }, + { + "cell_type": "markdown", + "id": "3b4a3af9-ed12-43ec-b17e-fd425b238265", + "metadata": {}, + "source": [ + "#### Deep analysis of representation_category_capacities.csv" + ] + }, + { + "cell_type": "markdown", + "id": "135966fb-aab1-48d7-bb4c-39a53ee643ca", + "metadata": {}, + "source": [ + "#### Deep analysis of representations.csv" + ] + }, + { + "cell_type": "markdown", + "id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5", + "metadata": {}, + "source": [ + "#### Deep analysis of events.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {