diff --git a/Notebook_AR.ipynb b/Notebook_AR.ipynb index a3c291b..3371cf3 100644 --- a/Notebook_AR.ipynb +++ b/Notebook_AR.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "30494c5e-9649-4fff-8708-617544188b20", "metadata": {}, "outputs": [ @@ -46,7 +46,7 @@ " 'bdc2324-data/9']" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "f1cce705-46e1-42de-8e93-2ee15312d288", "metadata": {}, "outputs": [], @@ -88,10 +88,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bdc2324-data/8/8campaign_stats.csv\n", + "bdc2324-data/8/8campaigns.csv\n", + "bdc2324-data/8/8categories.csv\n", + "bdc2324-data/8/8countries.csv\n", + "bdc2324-data/8/8currencies.csv\n", + "bdc2324-data/8/8customer_target_mappings.csv\n", + "bdc2324-data/8/8customersplus.csv\n", + "bdc2324-data/8/8event_types.csv\n", + "bdc2324-data/8/8events.csv\n", + "bdc2324-data/8/8facilities.csv\n", + "bdc2324-data/8/8link_stats.csv\n", + "bdc2324-data/8/8pricing_formulas.csv\n", + "bdc2324-data/8/8product_packs.csv\n", + "bdc2324-data/8/8products.csv\n", + "bdc2324-data/8/8products_groups.csv\n", + "bdc2324-data/8/8purchases.csv\n", + "bdc2324-data/8/8representation_category_capacities.csv\n", + "bdc2324-data/8/8representations.csv\n", + "bdc2324-data/8/8seasons.csv\n", + "bdc2324-data/8/8suppliers.csv\n", + "bdc2324-data/8/8target_types.csv\n", + "bdc2324-data/8/8targets.csv\n", + "bdc2324-data/8/8tickets.csv\n", + "bdc2324-data/8/8type_of_categories.csv\n", + "bdc2324-data/8/8type_of_pricing_formulas.csv\n", + "bdc2324-data/8/8type_ofs.csv\n" + ] + } + ], "source": [ "# check the files in the directory\n", "\n", @@ -103,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "65cb38ad-52ae-4266-85d8-c47d81b00283", "metadata": {}, "outputs": [], @@ -670,13 +703,503 @@ "## Create Universal database" ] }, + { + "cell_type": "markdown", + "id": "7e460fbe-5067-4998-a1a8-9e3d07401750", + "metadata": {}, + "source": [ + "We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n", + "\n", + "Let's first create our procedure for the company 1 and the datasets belongings to the theme producst" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, + "id": "590a132a-4f57-4ea3-a282-2ef913e4b753", + "metadata": {}, + "outputs": [], + "source": [ + "directory_path = '1'" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2", + "metadata": {}, + "outputs": [], + "source": [ + "def remove_horodates(df):\n", + " \"\"\"\n", + " this function remove horodate columns like created_at and updated_at\n", + " \"\"\"\n", + " df = df.drop(columns = [\"created_at\", \"updated_at\"])\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "2c478213-09ae-44ef-8c7c-125bcb571642", + "metadata": {}, + "outputs": [], + "source": [ + "def order_columns_id(df):\n", + " \"\"\"\n", + " this function puts all id columns at the beginning in order to read the dataset easier\n", + " \"\"\"\n", + " substring = 'id'\n", + " id_columns = [col for col in df.columns if substring in col]\n", + " remaining_col = [col for col in df.columns if substring not in col]\n", + " new_order = id_columns + remaining_col\n", + " return df[new_order]" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "327e44b0-eb99-4022-b4ca-79548072f0f0", + "metadata": {}, + "outputs": [], + "source": [ + "def percent_na(df):\n", + " \"\"\"\n", + " this function returns the percentage of na for each column\n", + " \"\"\"\n", + " percent_missing = df.isna().sum() * 100 / len(df)\n", + " return percent_missing" + ] + }, + { + "cell_type": "markdown", + "id": "98ac02cb-5295-47ca-99c6-99e622c5f388", + "metadata": {}, + "source": [ + "#### Deep analysis of products.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "862a7658-0602-4d94-bb58-d23774c00d32", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : bdc2324-data/1/1products.csv\n", + "Shape : (94803, 14)\n", + "Number of columns : 14\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idamountis_full_pricerepresentation_idpricing_formula_idcreated_atupdated_atcategory_idapply_priceproducts_group_idproduct_pack_idextra_fieldamount_consumptionidentifier
0106829.0False9141142020-09-03 14:09:43.119798+02:002020-09-03 14:09:43.119798+02:00410.0106551NaNNaN35c88f2db8a63d7474e46eb8ca9260e7
14789.5False2731312020-09-03 13:21:22.711773+02:002020-09-03 13:21:22.711773+02:0010.04711NaNNaN8a179671ab198e570e6a104c4451379f
22087311.5False2751372020-09-03 14:46:33.589030+02:002020-09-03 14:46:33.589030+02:0010.0208251NaNNaNee83779ce29e67ad251e40234b426d6a
31571428.0False8251992022-01-28 19:29:23.525722+01:002022-01-28 19:29:23.525722+01:0050.01567731NaNNaNd865383579314b791aa4bcf3fb418f17
413418.5False9932020-09-03 13:29:30.773089+02:002020-09-03 13:29:30.773089+02:0010.011751NaNNaNf1c4689bc47dee6f60b56d74b593dd46
\n", + "
" + ], + "text/plain": [ + " id amount is_full_price representation_id pricing_formula_id \\\n", + "0 10682 9.0 False 914 114 \n", + "1 478 9.5 False 273 131 \n", + "2 20873 11.5 False 275 137 \n", + "3 157142 8.0 False 82519 9 \n", + "4 1341 8.5 False 9 93 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-03 14:09:43.119798+02:00 2020-09-03 14:09:43.119798+02:00 \n", + "1 2020-09-03 13:21:22.711773+02:00 2020-09-03 13:21:22.711773+02:00 \n", + "2 2020-09-03 14:46:33.589030+02:00 2020-09-03 14:46:33.589030+02:00 \n", + "3 2022-01-28 19:29:23.525722+01:00 2022-01-28 19:29:23.525722+01:00 \n", + "4 2020-09-03 13:29:30.773089+02:00 2020-09-03 13:29:30.773089+02:00 \n", + "\n", + " category_id apply_price products_group_id product_pack_id extra_field \\\n", + "0 41 0.0 10655 1 NaN \n", + "1 1 0.0 471 1 NaN \n", + "2 1 0.0 20825 1 NaN \n", + "3 5 0.0 156773 1 NaN \n", + "4 1 0.0 1175 1 NaN \n", + "\n", + " amount_consumption identifier \n", + "0 NaN 35c88f2db8a63d7474e46eb8ca9260e7 \n", + "1 NaN 8a179671ab198e570e6a104c4451379f \n", + "2 NaN ee83779ce29e67ad251e40234b426d6a \n", + "3 NaN d865383579314b791aa4bcf3fb418f17 \n", + "4 NaN f1c4689bc47dee6f60b56d74b593dd46 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products = display_databases(\"1products.csv\")\n", + "print(\"Number of columns : \", len(products.columns))\n", + "products.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of columns : 12\n", + "Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n", + " 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n", + " 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n", + " dtype='object')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idrepresentation_idpricing_formula_idcategory_idproducts_group_idproduct_pack_ididentifieramountis_full_priceapply_priceextra_fieldamount_consumption
0106829141144110655135c88f2db8a63d7474e46eb8ca9260e79.0False0.0NaNNaN
1478273131147118a179671ab198e570e6a104c4451379f9.5False0.0NaNNaN
2208732751371208251ee83779ce29e67ad251e40234b426d6a11.5False0.0NaNNaN
315714282519951567731d865383579314b791aa4bcf3fb418f178.0False0.0NaNNaN
41341993111751f1c4689bc47dee6f60b56d74b593dd468.5False0.0NaNNaN
\n", + "
" + ], + "text/plain": [ + " id representation_id pricing_formula_id category_id \\\n", + "0 10682 914 114 41 \n", + "1 478 273 131 1 \n", + "2 20873 275 137 1 \n", + "3 157142 82519 9 5 \n", + "4 1341 9 93 1 \n", + "\n", + " products_group_id product_pack_id identifier \\\n", + "0 10655 1 35c88f2db8a63d7474e46eb8ca9260e7 \n", + "1 471 1 8a179671ab198e570e6a104c4451379f \n", + "2 20825 1 ee83779ce29e67ad251e40234b426d6a \n", + "3 156773 1 d865383579314b791aa4bcf3fb418f17 \n", + "4 1175 1 f1c4689bc47dee6f60b56d74b593dd46 \n", + "\n", + " amount is_full_price apply_price extra_field amount_consumption \n", + "0 9.0 False 0.0 NaN NaN \n", + "1 9.5 False 0.0 NaN NaN \n", + "2 11.5 False 0.0 NaN NaN \n", + "3 8.0 False 0.0 NaN NaN \n", + "4 8.5 False 0.0 NaN NaN " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products = remove_horodates(products)\n", + "print(\"Number of columns : \", len(products.columns))\n", + "products = order_columns_id(products)\n", + "print(\"Columns : \", products.columns)\n", + "products.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id int64\n", + "representation_id int64\n", + "pricing_formula_id int64\n", + "category_id int64\n", + "products_group_id int64\n", + "product_pack_id int64\n", + "identifier object\n", + "amount float64\n", + "is_full_price bool\n", + "apply_price float64\n", + "extra_field float64\n", + "amount_consumption float64\n", + "dtype: object\n" + ] + } + ], + "source": [ + "print(products.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "460749ac-aa26-4216-8667-518546f72f72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0.0\n", + "representation_id 0.0\n", + "pricing_formula_id 0.0\n", + "category_id 0.0\n", + "products_group_id 0.0\n", + "product_pack_id 0.0\n", + "identifier 0.0\n", + "amount 0.0\n", + "is_full_price 0.0\n", + "apply_price 0.0\n", + "extra_field 100.0\n", + "amount_consumption 100.0\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "percent_missing = products.isna().sum() * 100 / len(products)\n", + "print(percent_missing)" + ] } ], "metadata": {