process df

This commit is contained in:
Alexis REVELLE 2024-01-13 10:32:40 +00:00
parent 5e28f0ddb3
commit fb586b5348

View File

@ -10,7 +10,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 5,
"id": "20eeb149-6618-4ef2-9cfd-ff062950f36c", "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -22,7 +22,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 6,
"id": "30494c5e-9649-4fff-8708-617544188b20", "id": "30494c5e-9649-4fff-8708-617544188b20",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -46,7 +46,7 @@
" 'bdc2324-data/9']" " 'bdc2324-data/9']"
] ]
}, },
"execution_count": 3, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -78,7 +78,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 7,
"id": "f1cce705-46e1-42de-8e93-2ee15312d288", "id": "f1cce705-46e1-42de-8e93-2ee15312d288",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -88,7 +88,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 8,
"id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c", "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -136,7 +136,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 9,
"id": "65cb38ad-52ae-4266-85d8-c47d81b00283", "id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -715,7 +715,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 10,
"id": "590a132a-4f57-4ea3-a282-2ef913e4b753", "id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -725,7 +725,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 11,
"id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb", "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -735,7 +735,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 12,
"id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2", "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -750,7 +750,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 13,
"id": "2c478213-09ae-44ef-8c7c-125bcb571642", "id": "2c478213-09ae-44ef-8c7c-125bcb571642",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -768,7 +768,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 40, "execution_count": 14,
"id": "327e44b0-eb99-4022-b4ca-79548072f0f0", "id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -781,6 +781,22 @@
" return percent_missing" " return percent_missing"
] ]
}, },
{
"cell_type": "code",
"execution_count": 25,
"id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
"metadata": {},
"outputs": [],
"source": [
"def process_df(df):\n",
"    \"\"\"Run the notebook's cleaning helpers on ``df`` and print a short summary.\n",
"\n",
"    Passes ``df`` through ``remove_horodates`` and then ``order_columns_id``,\n",
"    printing the column count after the first step, the column index after\n",
"    the second, and finally the value returned by ``percent_na``.\n",
"\n",
"    NOTE(review): presumably the first helper drops timestamp columns and the\n",
"    second moves the id columns to the front (created_at/updated_at disappear\n",
"    and id/identifier lead in the recorded categories output) -- confirm\n",
"    against the helper definitions.\n",
"\n",
"    Returns the frame produced by ``order_columns_id``.\n",
"    \"\"\"\n",
"    without_dates = remove_horodates(df)\n",
"    print(\"Number of columns : \", len(without_dates.columns))\n",
"\n",
"    reordered = order_columns_id(without_dates)\n",
"    print(\"Columns : \", reordered.columns)\n",
"\n",
"    na_percent = percent_na(reordered)\n",
"    print(\"Percent of NA for each column : \", na_percent)\n",
"    return reordered"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "98ac02cb-5295-47ca-99c6-99e622c5f388", "id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
@ -791,7 +807,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 15,
"id": "862a7658-0602-4d94-bb58-d23774c00d32", "id": "862a7658-0602-4d94-bb58-d23774c00d32",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -961,7 +977,7 @@
"4 NaN f1c4689bc47dee6f60b56d74b593dd46 " "4 NaN f1c4689bc47dee6f60b56d74b593dd46 "
] ]
}, },
"execution_count": 32, "execution_count": 15,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -974,7 +990,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 16,
"id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28", "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1127,7 +1143,7 @@
"4 8.5 False 0.0 NaN NaN " "4 8.5 False 0.0 NaN NaN "
] ]
}, },
"execution_count": 33, "execution_count": 16,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1142,7 +1158,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 17,
"id": "a383474f-7da9-422c-bb69-3f0cc0b7053f", "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1172,7 +1188,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 38, "execution_count": 18,
"id": "460749ac-aa26-4216-8667-518546f72f72", "id": "460749ac-aa26-4216-8667-518546f72f72",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -1200,6 +1216,343 @@
"percent_missing = products.isna().sum() * 100 / len(products)\n", "percent_missing = products.isna().sum() * 100 / len(products)\n",
"print(percent_missing)" "print(percent_missing)"
] ]
},
{
"cell_type": "markdown",
"id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
"metadata": {},
"source": [
"#### Deep analysis of categories.csv"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1categories.csv'"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1categories.csv\n",
"Shape : (27, 7)\n",
"Number of columns : 7\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>extra_field</th>\n",
" <th>quota</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>30</td>\n",
" <td>en nb entrées gr</td>\n",
" <td>2020-09-03 13:21:20.019202+02:00</td>\n",
" <td>2020-09-03 13:21:20.019202+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" <td>indiv activité enfant</td>\n",
" <td>2020-09-03 13:11:23.306968+02:00</td>\n",
" <td>2020-09-03 13:11:23.306968+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>39</td>\n",
" <td>indiv activité gr</td>\n",
" <td>2020-09-03 13:21:20.029901+02:00</td>\n",
" <td>2020-09-03 13:21:20.029901+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1108</td>\n",
" <td>groupe forfait adulte</td>\n",
" <td>2020-09-19 02:06:43.145697+02:00</td>\n",
" <td>2020-09-19 02:06:43.145697+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3edda20c877a93b5ff883827238eb711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>groupe forfait entrées tr</td>\n",
" <td>2020-09-03 13:11:23.264997+02:00</td>\n",
" <td>2020-09-03 13:11:23.264997+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at \\\n",
"0 30 en nb entrées gr 2020-09-03 13:21:20.019202+02:00 \n",
"1 16 indiv activité enfant 2020-09-03 13:11:23.306968+02:00 \n",
"2 39 indiv activité gr 2020-09-03 13:21:20.029901+02:00 \n",
"3 1108 groupe forfait adulte 2020-09-19 02:06:43.145697+02:00 \n",
"4 6 groupe forfait entrées tr 2020-09-03 13:11:23.264997+02:00 \n",
"\n",
" updated_at extra_field quota \\\n",
"0 2020-09-03 13:21:20.019202+02:00 NaN NaN \n",
"1 2020-09-03 13:11:23.306968+02:00 NaN NaN \n",
"2 2020-09-03 13:21:20.029901+02:00 NaN NaN \n",
"3 2020-09-19 02:06:43.145697+02:00 NaN NaN \n",
"4 2020-09-03 13:11:23.264997+02:00 NaN NaN \n",
"\n",
" identifier \n",
"0 849ab2791a14f5fc2bb4d87ab2b78bf6 \n",
"1 425fd2f01984cc4ba030c1be98f42c33 \n",
"2 9244dd3738788db0d22a5d0afe687b69 \n",
"3 3edda20c877a93b5ff883827238eb711 \n",
"4 ff48df4b2dd5a14116bf4d280b31621e "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", len(df.columns))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 5\n",
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
"Percent of NA for each column : id 0.000000\n",
"identifier 0.000000\n",
"name 3.703704\n",
"extra_field 100.000000\n",
"quota 100.000000\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>identifier</th>\n",
" <th>name</th>\n",
" <th>extra_field</th>\n",
" <th>quota</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>30</td>\n",
" <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
" <td>en nb entrées gr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16</td>\n",
" <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
" <td>indiv activité enfant</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>39</td>\n",
" <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
" <td>indiv activité gr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1108</td>\n",
" <td>3edda20c877a93b5ff883827238eb711</td>\n",
" <td>groupe forfait adulte</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6</td>\n",
" <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
" <td>groupe forfait entrées tr</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id identifier name \\\n",
"0 30 849ab2791a14f5fc2bb4d87ab2b78bf6 en nb entrées gr \n",
"1 16 425fd2f01984cc4ba030c1be98f42c33 indiv activité enfant \n",
"2 39 9244dd3738788db0d22a5d0afe687b69 indiv activité gr \n",
"3 1108 3edda20c877a93b5ff883827238eb711 groupe forfait adulte \n",
"4 6 ff48df4b2dd5a14116bf4d280b31621e groupe forfait entrées tr \n",
"\n",
" extra_field quota \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"identifier object\n",
"name object\n",
"extra_field float64\n",
"quota float64\n",
"dtype: object"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
"metadata": {},
"source": [
"#### Deep analysis of type_of_categories.csv"
]
},
{
"cell_type": "markdown",
"id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
"metadata": {},
"source": [
"#### Deep analysis of representation_category_capacities.csv"
]
},
{
"cell_type": "markdown",
"id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
"metadata": {},
"source": [
"#### Deep analysis of representations.csv"
]
},
{
"cell_type": "markdown",
"id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
"metadata": {},
"source": [
"#### Deep analysis of events.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {