{
"cells": [
{
"cell_type": "markdown",
"id": "455cc769-1b3b-4fef-b395-e74a988ceed3",
"metadata": {},
"source": [
"## Notebook Alexis"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "20eeb149-6618-4ef2-9cfd-ff062950f36c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import s3fs"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "30494c5e-9649-4fff-8708-617544188b20",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "2feffee9-9f23-4caa-8a01-9e4a93abbf5d",
"metadata": {},
"source": [
"### I. Analyse fichier 8"
]
},
{
"cell_type": "markdown",
"id": "f54ba449-2051-4acd-939d-d30abd5452fe",
"metadata": {},
"source": [
"This section describes the databases associated with company 8. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f1cce705-46e1-42de-8e93-2ee15312d288",
"metadata": {},
"outputs": [],
"source": [
"directory_path = '8'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/8/8campaign_stats.csv\n",
"bdc2324-data/8/8campaigns.csv\n",
"bdc2324-data/8/8categories.csv\n",
"bdc2324-data/8/8countries.csv\n",
"bdc2324-data/8/8currencies.csv\n",
"bdc2324-data/8/8customer_target_mappings.csv\n",
"bdc2324-data/8/8customersplus.csv\n",
"bdc2324-data/8/8event_types.csv\n",
"bdc2324-data/8/8events.csv\n",
"bdc2324-data/8/8facilities.csv\n",
"bdc2324-data/8/8link_stats.csv\n",
"bdc2324-data/8/8pricing_formulas.csv\n",
"bdc2324-data/8/8product_packs.csv\n",
"bdc2324-data/8/8products.csv\n",
"bdc2324-data/8/8products_groups.csv\n",
"bdc2324-data/8/8purchases.csv\n",
"bdc2324-data/8/8representation_category_capacities.csv\n",
"bdc2324-data/8/8representations.csv\n",
"bdc2324-data/8/8seasons.csv\n",
"bdc2324-data/8/8suppliers.csv\n",
"bdc2324-data/8/8target_types.csv\n",
"bdc2324-data/8/8targets.csv\n",
"bdc2324-data/8/8tickets.csv\n",
"bdc2324-data/8/8type_of_categories.csv\n",
"bdc2324-data/8/8type_of_pricing_formulas.csv\n",
"bdc2324-data/8/8type_ofs.csv\n"
]
}
],
"source": [
"# check the files in the directory\n",
"\n",
"objects = fs.ls(f'{BUCKET}/{directory_path}')\n",
"\n",
"for file in objects:\n",
" print(file)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
"metadata": {},
"outputs": [],
"source": [
"def display_databases(file_name):\n",
" \"\"\"\n",
" This function returns the file from s3 storage\n",
" \"\"\"\n",
" file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
" print(\"File path : \", file_path)\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
" \n",
" print(\"Shape : \", df.shape)\n",
" return df\n",
" "
]
},
{
"cell_type": "markdown",
"id": "ddd545ef-7e9f-4696-962a-115294991641",
"metadata": {},
"source": [
"#### Lookt at campaigns files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0214d30d-5f83-498f-867f-e67b5793b731",
"metadata": {},
"outputs": [],
"source": [
"campaigns = display_databases(\"8campaigns.csv\")\n",
"campaigns.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7982be4-2c42-4a91-be5a-329a999644cc",
"metadata": {},
"outputs": [],
"source": [
"campaign_stats = display_databases(\"8campaign_stats.csv\")\n",
"campaign_stats.head()"
]
},
{
"cell_type": "markdown",
"id": "e6512bc9-91f5-4fe4-a637-a4e84dc497a9",
"metadata": {},
"source": [
"#### Look at links files"
]
},
{
"cell_type": "markdown",
"id": "28e7c1fe-470f-4d84-87b8-a711a973500b",
"metadata": {},
"source": [
"There is no links file for these company. Only the link_stats file"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e973575b-4ed6-4b23-8024-f383ac82e87c",
"metadata": {},
"outputs": [],
"source": [
"links_stats = display_databases(\"8link_stats.csv\")\n",
"links_stats.head()"
]
},
{
"cell_type": "markdown",
"id": "8dfcca1f-1323-413f-aa8d-3ee5ce2610a8",
"metadata": {},
"source": [
"#### Analyse Customersplus file"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b523575-c779-451c-a12e-a36fb4ad232c",
"metadata": {},
"outputs": [],
"source": [
"file_name = \"8customersplus.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"with fs.open(file_path, mode=\"rb\") as file_in:\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"customersplus.head()"
]
},
{
"cell_type": "markdown",
"id": "fe56785a-ed3c-4322-aafa-a630f97b836f",
"metadata": {},
"source": [
"#### Analyse Structures files"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd",
"metadata": {},
"outputs": [],
"source": [
"file_name = \"8structures.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" structures = pd.read_csv(file_in, sep=\",\")\n",
"except:\n",
" print(\"No structures database\")"
]
},
{
"cell_type": "markdown",
"id": "b8452558-2d32-459b-91e7-f6042345e465",
"metadata": {},
"source": [
"For Stade Français, there is no structures, tags and structure_tag_mapping databases"
]
},
{
"cell_type": "markdown",
"id": "285b1422-9ca9-4afd-b752-777a54aaa677",
"metadata": {},
"source": [
"#### Analyze Target databases"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178",
"metadata": {},
"outputs": [],
"source": [
"file_name = \"8customer_target_mappings.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" customer_targets = pd.read_csv(file_in, sep=\",\")\n",
" \n",
"except:\n",
" print(\"No such database in s3\")\n",
"\n",
"print(\"Shape : \", customer_targets.shape)\n",
"customer_targets.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263",
"metadata": {},
"outputs": [],
"source": [
"file_name = \"8targets.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" targets = pd.read_csv(file_in, sep=\",\")\n",
" \n",
"except:\n",
" print(\"No such database in s3\")\n",
"\n",
"print(\"Shape : \", targets.shape)\n",
"targets.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85696d74-3b2f-4368-9045-44db5322b60d",
"metadata": {},
"outputs": [],
"source": [
"file_name = \"8target_types.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
"print(file_path)\n",
"try:\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" target_types = pd.read_csv(file_in, sep=\",\")\n",
" \n",
"except:\n",
" print(\"No such database in s3\")\n",
"\n",
"print(\"Shape : \", target_types.shape)\n",
"target_types.head()"
]
},
{
"cell_type": "markdown",
"id": "cdc6416b-3deb-446c-8957-435745b93533",
"metadata": {},
"source": [
"#### Analyze consumption files"
]
},
{
"cell_type": "markdown",
"id": "f8622bd5-a5ab-403f-ab01-758aec879ee4",
"metadata": {},
"source": [
"Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv\n",
"\n",
"However, there is no consumptions.csv file"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4",
"metadata": {},
"outputs": [],
"source": [
"purchases = display_databases(\"8purchases.csv\")\n",
"purchases.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "903321fb-99f8-475d-b4a6-c70ec2efe190",
"metadata": {},
"outputs": [],
"source": [
"tickets = display_databases(\"8tickets.csv\")\n",
"tickets.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "243e6942-0233-4cd5-b32b-e005457131d2",
"metadata": {},
"outputs": [],
"source": [
"suppliers = display_databases(\"8suppliers.csv\")\n",
"suppliers.head()"
]
},
{
"cell_type": "markdown",
"id": "fd8c876a-f0c5-4123-a422-c267af5f29b1",
"metadata": {},
"source": [
"#### Analyse product file"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b82efce-1dee-4d89-8585-28c4ad477eef",
"metadata": {},
"outputs": [],
"source": [
"products = display_databases(\"8products.csv\")\n",
"products.head()"
]
},
{
"cell_type": "markdown",
"id": "8ad143b2-2869-4bd2-982e-688498b98727",
"metadata": {},
"source": [
"#### Analyze pricing files"
]
},
{
"cell_type": "markdown",
"id": "9a54e9a5-801d-4000-9e76-e792edbf7e41",
"metadata": {},
"source": [
"Meaning pricing_formulas.csv and type_of_pricing_formulas"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "daf37bff-a26d-4ff5-ad50-c90f917164bd",
"metadata": {},
"outputs": [],
"source": [
"pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n",
"pricing_formulas.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdb14488-b093-4b39-84fa-1c2b4576208f",
"metadata": {},
"outputs": [],
"source": [
"type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n",
"type_pricing_formulas.head()"
]
},
{
"cell_type": "markdown",
"id": "a084297a-4fd7-4cda-b513-7704f4244a5c",
"metadata": {},
"source": [
"#### Analyze type of products"
]
},
{
"cell_type": "markdown",
"id": "76a67ea7-8720-441e-8973-23e5d105370e",
"metadata": {},
"source": [
"Meaning categories.csv, type_of_categories.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6582694d-5339-4f33-a943-c73033121a90",
"metadata": {},
"outputs": [],
"source": [
"categories = display_databases(\"8categories.csv\")\n",
"categories.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "589076df-1958-42de-9941-1aff9fa8536f",
"metadata": {},
"outputs": [],
"source": [
"type_categories = display_databases(\"8type_of_categories.csv\")\n",
"type_categories.head()"
]
},
{
"cell_type": "markdown",
"id": "3427b681-4c05-4e4e-9c2b-867ee789f98c",
"metadata": {},
"source": [
"#### Analyze type of representations"
]
},
{
"cell_type": "markdown",
"id": "9381e36b-090a-44c5-a29d-3ac4c9a4431e",
"metadata": {},
"source": [
"Meaning representation_category_capacities.csv, representations.csv, representations_types.csv\n",
"\n",
"however there is no representation_types database"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346",
"metadata": {},
"outputs": [],
"source": [
"representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n",
"representation_category_capacities.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd405913-033d-4f15-a5b9-103d577baaff",
"metadata": {},
"outputs": [],
"source": [
"representations = display_databases(\"8representations.csv\")\n",
"representations.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f",
"metadata": {},
"outputs": [],
"source": [
"#representation_type = display_databases(\"8representation_types.csv\")"
]
},
{
"cell_type": "markdown",
"id": "a9b02406-2a69-4431-8d49-3c6bd6a5e1c7",
"metadata": {},
"source": [
"#### Analyze type of events"
]
},
{
"cell_type": "markdown",
"id": "1d554266-282c-4f64-9a0f-ddcf591ec912",
"metadata": {},
"source": [
"Meaning events.csv, event_types.csv, seasons.csv and facilities.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf",
"metadata": {},
"outputs": [],
"source": [
"events = display_databases(\"8events.csv\")\n",
"events.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460",
"metadata": {},
"outputs": [],
"source": [
"event_types = display_databases(\"8event_types.csv\")\n",
"event_types.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cba0ee58-6280-45fe-99b3-0be09db5922b",
"metadata": {},
"outputs": [],
"source": [
"seasons = display_databases(\"8seasons.csv\")\n",
"seasons.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0",
"metadata": {},
"outputs": [],
"source": [
"facilities = display_databases(\"8facilities.csv\")\n",
"facilities.head()"
]
},
{
"cell_type": "markdown",
"id": "c7467d41-0ded-465d-bb08-15be914a166b",
"metadata": {},
"source": [
"#### Analyze annexe databases"
]
},
{
"cell_type": "markdown",
"id": "17e9e334-0ae4-48d8-bed5-b50b4af49d5b",
"metadata": {},
"source": [
"Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csc"
]
},
{
"cell_type": "markdown",
"id": "d3ec1040-48b2-40bb-8947-920ddb4589f3",
"metadata": {},
"source": [
"## II. Identify Commons Datasets"
]
},
{
"cell_type": "markdown",
"id": "ec528a8a-df38-48e2-a1be-4a1459a80a1e",
"metadata": {},
"source": [
"From the analyze of the 8th company, we notice that some databases does not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the common databases between all companies"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "c240b811-48a6-4501-9e70-bc51d69e3ac4",
"metadata": {},
"outputs": [],
"source": [
"## We first construct a dictionary reporting all the datasets for each companies\n",
"\n",
"companies = fs.ls(BUCKET)\n",
"companies_database = {}\n",
"\n",
"for company in companies:\n",
" companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] \n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "54057367-9df9-42f4-aa07-bf524bb76462",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of databases : 30\n"
]
}
],
"source": [
"# Then we create a list of all database\n",
"\n",
"all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]\n",
"print(\"Number of databases : \",len(all_database))"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "63914e20-9efc-4088-877b-edab5f225d00",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30\n",
"23\n"
]
}
],
"source": [
"## We then create a set of database in common for all companies\n",
"\n",
"data_in_common = set(all_database)\n",
"\n",
"print(len(data_in_common))\n",
"\n",
"for key in companies_database:\n",
" diff_database = data_in_common.symmetric_difference(companies_database[key])\n",
" data_in_common = data_in_common - diff_database\n",
"\n",
"print(len(data_in_common))\n",
" "
]
},
{
"cell_type": "markdown",
"id": "676d8536-7d8c-4075-a357-b8d06e501ca8",
"metadata": {},
"source": [
"## Create Universal database"
]
},
{
"cell_type": "markdown",
"id": "7e460fbe-5067-4998-a1a8-9e3d07401750",
"metadata": {},
"source": [
"We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n",
"\n",
"Let's first create our procedure for the company 1 and the datasets belongings to the theme producst"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
"metadata": {},
"outputs": [],
"source": [
"directory_path = '1'"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
"metadata": {},
"outputs": [],
"source": [
"theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
"metadata": {},
"outputs": [],
"source": [
"def remove_horodates(df):\n",
" \"\"\"\n",
" this function remove horodate columns like created_at and updated_at\n",
" \"\"\"\n",
" df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "2c478213-09ae-44ef-8c7c-125bcb571642",
"metadata": {},
"outputs": [],
"source": [
"def order_columns_id(df):\n",
" \"\"\"\n",
" this function puts all id columns at the beginning in order to read the dataset easier\n",
" \"\"\"\n",
" substring = 'id'\n",
" id_columns = [col for col in df.columns if substring in col]\n",
" remaining_col = [col for col in df.columns if substring not in col]\n",
" new_order = id_columns + remaining_col\n",
" return df[new_order]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
"metadata": {},
"outputs": [],
"source": [
"def percent_na(df):\n",
" \"\"\"\n",
" this function returns the percentage of na for each column\n",
" \"\"\"\n",
" percent_missing = df.isna().sum() * 100 / len(df)\n",
" return percent_missing"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
"metadata": {},
"outputs": [],
"source": [
"def process_df(df):\n",
" df = remove_horodates(df)\n",
" print(\"Number of columns : \", len(df.columns))\n",
" df = order_columns_id(df)\n",
" print(\"Columns : \", df.columns)\n",
" print(\"Percent of NA for each column : \", percent_na(df))\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
"metadata": {},
"source": [
"#### Deep analysis of products.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "862a7658-0602-4d94-bb58-d23774c00d32",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1products.csv\n",
"Shape : (94803, 14)\n",
"Number of columns : 14\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" amount | \n",
" is_full_price | \n",
" representation_id | \n",
" pricing_formula_id | \n",
" created_at | \n",
" updated_at | \n",
" category_id | \n",
" apply_price | \n",
" products_group_id | \n",
" product_pack_id | \n",
" extra_field | \n",
" amount_consumption | \n",
" identifier | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10682 | \n",
" 9.0 | \n",
" False | \n",
" 914 | \n",
" 114 | \n",
" 2020-09-03 14:09:43.119798+02:00 | \n",
" 2020-09-03 14:09:43.119798+02:00 | \n",
" 41 | \n",
" 0.0 | \n",
" 10655 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" 35c88f2db8a63d7474e46eb8ca9260e7 | \n",
"
\n",
" \n",
" 1 | \n",
" 478 | \n",
" 9.5 | \n",
" False | \n",
" 273 | \n",
" 131 | \n",
" 2020-09-03 13:21:22.711773+02:00 | \n",
" 2020-09-03 13:21:22.711773+02:00 | \n",
" 1 | \n",
" 0.0 | \n",
" 471 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" 8a179671ab198e570e6a104c4451379f | \n",
"
\n",
" \n",
" 2 | \n",
" 20873 | \n",
" 11.5 | \n",
" False | \n",
" 275 | \n",
" 137 | \n",
" 2020-09-03 14:46:33.589030+02:00 | \n",
" 2020-09-03 14:46:33.589030+02:00 | \n",
" 1 | \n",
" 0.0 | \n",
" 20825 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" ee83779ce29e67ad251e40234b426d6a | \n",
"
\n",
" \n",
" 3 | \n",
" 157142 | \n",
" 8.0 | \n",
" False | \n",
" 82519 | \n",
" 9 | \n",
" 2022-01-28 19:29:23.525722+01:00 | \n",
" 2022-01-28 19:29:23.525722+01:00 | \n",
" 5 | \n",
" 0.0 | \n",
" 156773 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" d865383579314b791aa4bcf3fb418f17 | \n",
"
\n",
" \n",
" 4 | \n",
" 1341 | \n",
" 8.5 | \n",
" False | \n",
" 9 | \n",
" 93 | \n",
" 2020-09-03 13:29:30.773089+02:00 | \n",
" 2020-09-03 13:29:30.773089+02:00 | \n",
" 1 | \n",
" 0.0 | \n",
" 1175 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" f1c4689bc47dee6f60b56d74b593dd46 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id amount is_full_price representation_id pricing_formula_id \\\n",
"0 10682 9.0 False 914 114 \n",
"1 478 9.5 False 273 131 \n",
"2 20873 11.5 False 275 137 \n",
"3 157142 8.0 False 82519 9 \n",
"4 1341 8.5 False 9 93 \n",
"\n",
" created_at updated_at \\\n",
"0 2020-09-03 14:09:43.119798+02:00 2020-09-03 14:09:43.119798+02:00 \n",
"1 2020-09-03 13:21:22.711773+02:00 2020-09-03 13:21:22.711773+02:00 \n",
"2 2020-09-03 14:46:33.589030+02:00 2020-09-03 14:46:33.589030+02:00 \n",
"3 2022-01-28 19:29:23.525722+01:00 2022-01-28 19:29:23.525722+01:00 \n",
"4 2020-09-03 13:29:30.773089+02:00 2020-09-03 13:29:30.773089+02:00 \n",
"\n",
" category_id apply_price products_group_id product_pack_id extra_field \\\n",
"0 41 0.0 10655 1 NaN \n",
"1 1 0.0 471 1 NaN \n",
"2 1 0.0 20825 1 NaN \n",
"3 5 0.0 156773 1 NaN \n",
"4 1 0.0 1175 1 NaN \n",
"\n",
" amount_consumption identifier \n",
"0 NaN 35c88f2db8a63d7474e46eb8ca9260e7 \n",
"1 NaN 8a179671ab198e570e6a104c4451379f \n",
"2 NaN ee83779ce29e67ad251e40234b426d6a \n",
"3 NaN d865383579314b791aa4bcf3fb418f17 \n",
"4 NaN f1c4689bc47dee6f60b56d74b593dd46 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products = display_databases(\"1products.csv\")\n",
"print(\"Number of columns : \", len(products.columns))\n",
"products.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 12\n",
"Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
" 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
" 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" representation_id | \n",
" pricing_formula_id | \n",
" category_id | \n",
" products_group_id | \n",
" product_pack_id | \n",
" identifier | \n",
" amount | \n",
" is_full_price | \n",
" apply_price | \n",
" extra_field | \n",
" amount_consumption | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10682 | \n",
" 914 | \n",
" 114 | \n",
" 41 | \n",
" 10655 | \n",
" 1 | \n",
" 35c88f2db8a63d7474e46eb8ca9260e7 | \n",
" 9.0 | \n",
" False | \n",
" 0.0 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 478 | \n",
" 273 | \n",
" 131 | \n",
" 1 | \n",
" 471 | \n",
" 1 | \n",
" 8a179671ab198e570e6a104c4451379f | \n",
" 9.5 | \n",
" False | \n",
" 0.0 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 20873 | \n",
" 275 | \n",
" 137 | \n",
" 1 | \n",
" 20825 | \n",
" 1 | \n",
" ee83779ce29e67ad251e40234b426d6a | \n",
" 11.5 | \n",
" False | \n",
" 0.0 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 157142 | \n",
" 82519 | \n",
" 9 | \n",
" 5 | \n",
" 156773 | \n",
" 1 | \n",
" d865383579314b791aa4bcf3fb418f17 | \n",
" 8.0 | \n",
" False | \n",
" 0.0 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 1341 | \n",
" 9 | \n",
" 93 | \n",
" 1 | \n",
" 1175 | \n",
" 1 | \n",
" f1c4689bc47dee6f60b56d74b593dd46 | \n",
" 8.5 | \n",
" False | \n",
" 0.0 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id representation_id pricing_formula_id category_id \\\n",
"0 10682 914 114 41 \n",
"1 478 273 131 1 \n",
"2 20873 275 137 1 \n",
"3 157142 82519 9 5 \n",
"4 1341 9 93 1 \n",
"\n",
" products_group_id product_pack_id identifier \\\n",
"0 10655 1 35c88f2db8a63d7474e46eb8ca9260e7 \n",
"1 471 1 8a179671ab198e570e6a104c4451379f \n",
"2 20825 1 ee83779ce29e67ad251e40234b426d6a \n",
"3 156773 1 d865383579314b791aa4bcf3fb418f17 \n",
"4 1175 1 f1c4689bc47dee6f60b56d74b593dd46 \n",
"\n",
" amount is_full_price apply_price extra_field amount_consumption \n",
"0 9.0 False 0.0 NaN NaN \n",
"1 9.5 False 0.0 NaN NaN \n",
"2 11.5 False 0.0 NaN NaN \n",
"3 8.0 False 0.0 NaN NaN \n",
"4 8.5 False 0.0 NaN NaN "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"products = remove_horodates(products)\n",
"print(\"Number of columns : \", len(products.columns))\n",
"products = order_columns_id(products)\n",
"print(\"Columns : \", products.columns)\n",
"products.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id int64\n",
"representation_id int64\n",
"pricing_formula_id int64\n",
"category_id int64\n",
"products_group_id int64\n",
"product_pack_id int64\n",
"identifier object\n",
"amount float64\n",
"is_full_price bool\n",
"apply_price float64\n",
"extra_field float64\n",
"amount_consumption float64\n",
"dtype: object\n"
]
}
],
"source": [
"print(products.dtypes)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "460749ac-aa26-4216-8667-518546f72f72",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.0\n",
"representation_id 0.0\n",
"pricing_formula_id 0.0\n",
"category_id 0.0\n",
"products_group_id 0.0\n",
"product_pack_id 0.0\n",
"identifier 0.0\n",
"amount 0.0\n",
"is_full_price 0.0\n",
"apply_price 0.0\n",
"extra_field 100.0\n",
"amount_consumption 100.0\n",
"dtype: float64\n"
]
}
],
"source": [
"percent_missing = products.isna().sum() * 100 / len(products)\n",
"print(percent_missing)"
]
},
{
"cell_type": "markdown",
"id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
"metadata": {},
"source": [
"#### Deep analysis of categories.csv"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
"metadata": {},
"outputs": [],
"source": [
"name_dataset = '1categories.csv'"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1categories.csv\n",
"Shape : (27, 7)\n",
"Number of columns : 7\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" name | \n",
" created_at | \n",
" updated_at | \n",
" extra_field | \n",
" quota | \n",
" identifier | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 30 | \n",
" en nb entrées gr | \n",
" 2020-09-03 13:21:20.019202+02:00 | \n",
" 2020-09-03 13:21:20.019202+02:00 | \n",
" NaN | \n",
" NaN | \n",
" 849ab2791a14f5fc2bb4d87ab2b78bf6 | \n",
"
\n",
" \n",
" 1 | \n",
" 16 | \n",
" indiv activité enfant | \n",
" 2020-09-03 13:11:23.306968+02:00 | \n",
" 2020-09-03 13:11:23.306968+02:00 | \n",
" NaN | \n",
" NaN | \n",
" 425fd2f01984cc4ba030c1be98f42c33 | \n",
"
\n",
" \n",
" 2 | \n",
" 39 | \n",
" indiv activité gr | \n",
" 2020-09-03 13:21:20.029901+02:00 | \n",
" 2020-09-03 13:21:20.029901+02:00 | \n",
" NaN | \n",
" NaN | \n",
" 9244dd3738788db0d22a5d0afe687b69 | \n",
"
\n",
" \n",
" 3 | \n",
" 1108 | \n",
" groupe forfait adulte | \n",
" 2020-09-19 02:06:43.145697+02:00 | \n",
" 2020-09-19 02:06:43.145697+02:00 | \n",
" NaN | \n",
" NaN | \n",
" 3edda20c877a93b5ff883827238eb711 | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" groupe forfait entrées tr | \n",
" 2020-09-03 13:11:23.264997+02:00 | \n",
" 2020-09-03 13:11:23.264997+02:00 | \n",
" NaN | \n",
" NaN | \n",
" ff48df4b2dd5a14116bf4d280b31621e | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id name created_at \\\n",
"0 30 en nb entrées gr 2020-09-03 13:21:20.019202+02:00 \n",
"1 16 indiv activité enfant 2020-09-03 13:11:23.306968+02:00 \n",
"2 39 indiv activité gr 2020-09-03 13:21:20.029901+02:00 \n",
"3 1108 groupe forfait adulte 2020-09-19 02:06:43.145697+02:00 \n",
"4 6 groupe forfait entrées tr 2020-09-03 13:11:23.264997+02:00 \n",
"\n",
" updated_at extra_field quota \\\n",
"0 2020-09-03 13:21:20.019202+02:00 NaN NaN \n",
"1 2020-09-03 13:11:23.306968+02:00 NaN NaN \n",
"2 2020-09-03 13:21:20.029901+02:00 NaN NaN \n",
"3 2020-09-19 02:06:43.145697+02:00 NaN NaN \n",
"4 2020-09-03 13:11:23.264997+02:00 NaN NaN \n",
"\n",
" identifier \n",
"0 849ab2791a14f5fc2bb4d87ab2b78bf6 \n",
"1 425fd2f01984cc4ba030c1be98f42c33 \n",
"2 9244dd3738788db0d22a5d0afe687b69 \n",
"3 3edda20c877a93b5ff883827238eb711 \n",
"4 ff48df4b2dd5a14116bf4d280b31621e "
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = display_databases(name_dataset)\n",
"print(\"Number of columns : \", len(df.columns))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of columns : 5\n",
"Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
"Percent of NA for each column : id 0.000000\n",
"identifier 0.000000\n",
"name 3.703704\n",
"extra_field 100.000000\n",
"quota 100.000000\n",
"dtype: float64\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" identifier | \n",
" name | \n",
" extra_field | \n",
" quota | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 30 | \n",
" 849ab2791a14f5fc2bb4d87ab2b78bf6 | \n",
" en nb entrées gr | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 16 | \n",
" 425fd2f01984cc4ba030c1be98f42c33 | \n",
" indiv activité enfant | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 39 | \n",
" 9244dd3738788db0d22a5d0afe687b69 | \n",
" indiv activité gr | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 1108 | \n",
" 3edda20c877a93b5ff883827238eb711 | \n",
" groupe forfait adulte | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" 6 | \n",
" ff48df4b2dd5a14116bf4d280b31621e | \n",
" groupe forfait entrées tr | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id identifier name \\\n",
"0 30 849ab2791a14f5fc2bb4d87ab2b78bf6 en nb entrées gr \n",
"1 16 425fd2f01984cc4ba030c1be98f42c33 indiv activité enfant \n",
"2 39 9244dd3738788db0d22a5d0afe687b69 indiv activité gr \n",
"3 1108 3edda20c877a93b5ff883827238eb711 groupe forfait adulte \n",
"4 6 ff48df4b2dd5a14116bf4d280b31621e groupe forfait entrées tr \n",
"\n",
" extra_field quota \n",
"0 NaN NaN \n",
"1 NaN NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN "
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = process_df(df)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id int64\n",
"identifier object\n",
"name object\n",
"extra_field float64\n",
"quota float64\n",
"dtype: object"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
"metadata": {},
"source": [
"#### Deep analysis of type_of_categories.csv"
]
},
{
"cell_type": "markdown",
"id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
"metadata": {},
"source": [
"#### Deep analysis of representation_category_capacities.csv"
]
},
{
"cell_type": "markdown",
"id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
"metadata": {},
"source": [
"#### Deep analysis of representations.csv"
]
},
{
"cell_type": "markdown",
"id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
"metadata": {},
"source": [
"#### Deep analysis of events.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}