BDC-team-1/Notebook_AR.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "455cc769-1b3b-4fef-b395-e74a988ceed3",
   "metadata": {},
   "source": [
    "## Notebook Alexis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import s3fs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30494c5e-9649-4fff-8708-617544188b20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create filesystem object\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
    "\n",
    "BUCKET = \"bdc2324-data\"\n",
    "fs.ls(BUCKET)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2feffee9-9f23-4caa-8a01-9e4a93abbf5d",
   "metadata": {},
   "source": [
    "###  I. Analyse fichier 8"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f54ba449-2051-4acd-939d-d30abd5452fe",
   "metadata": {},
   "source": [
    "This section describes the databases associated with company 8. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1cce705-46e1-42de-8e93-2ee15312d288",
   "metadata": {},
   "outputs": [],
   "source": [
    "directory_path = '8'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check the files in the directory\n",
    "\n",
    "objects = fs.ls(f'{BUCKET}/{directory_path}')\n",
    "\n",
    "for file in objects:\n",
    "    print(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_databases(file_name):\n",
    "    \"\"\"\n",
    "    This function returns the file from s3 storage\n",
    "    \"\"\"\n",
    "    file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
    "    print(\"File path : \", file_path)\n",
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in, sep=\",\")\n",
    "        \n",
    "    print(\"Shape : \", df.shape)\n",
    "    return df\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddd545ef-7e9f-4696-962a-115294991641",
   "metadata": {},
   "source": [
    "#### Look at campaigns files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0214d30d-5f83-498f-867f-e67b5793b731",
   "metadata": {},
   "outputs": [],
   "source": [
    "campaigns = display_databases(\"8campaigns.csv\")\n",
    "campaigns.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7982be4-2c42-4a91-be5a-329a999644cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "campaign_stats = display_databases(\"8campaign_stats.csv\")\n",
    "campaign_stats.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6512bc9-91f5-4fe4-a637-a4e84dc497a9",
   "metadata": {},
   "source": [
    "#### Look at links files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28e7c1fe-470f-4d84-87b8-a711a973500b",
   "metadata": {},
   "source": [
    "There is no links file for this company, only the link_stats file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e973575b-4ed6-4b23-8024-f383ac82e87c",
   "metadata": {},
   "outputs": [],
   "source": [
    "links_stats = display_databases(\"8link_stats.csv\")\n",
    "links_stats.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8dfcca1f-1323-413f-aa8d-3ee5ce2610a8",
   "metadata": {},
   "source": [
    "#### Analyse Customersplus file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b523575-c779-451c-a12e-a36fb4ad232c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the shared loader instead of duplicating the S3 read logic.\n",
    "customersplus = display_databases(\"8customersplus.csv\")\n",
    "customersplus.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe56785a-ed3c-4322-aafa-a630f97b836f",
   "metadata": {},
   "source": [
    "#### Analyse Structures files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_name = \"8structures.csv\"\n",
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
    "print(file_path)\n",
    "# Only a missing file is expected here. Any other failure (credentials,\n",
    "# network, malformed CSV) must stay visible instead of being swallowed.\n",
    "try:\n",
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
    "        structures = pd.read_csv(file_in, sep=\",\")\n",
    "except FileNotFoundError:\n",
    "    # Define the name anyway so a stale frame from a previous run is not reused.\n",
    "    structures = None\n",
    "    print(\"No structures database\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8452558-2d32-459b-91e7-f6042345e465",
   "metadata": {},
   "source": [
    "For Stade Français, there are no structures, tags or structure_tag_mapping databases"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "285b1422-9ca9-4afd-b752-777a54aaa677",
   "metadata": {},
   "source": [
    "#### Analyze Target databases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A missing file now fails loudly inside the loader instead of being hidden by a\n",
    "# bare except and resurfacing as a NameError (or stale data) on the next line.\n",
    "customer_targets = display_databases(\"8customer_target_mappings.csv\")\n",
    "customer_targets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A missing file now fails loudly inside the loader instead of being hidden by a\n",
    "# bare except and resurfacing as a NameError (or stale data) on the next line.\n",
    "targets = display_databases(\"8targets.csv\")\n",
    "targets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85696d74-3b2f-4368-9045-44db5322b60d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A missing file now fails loudly inside the loader instead of being hidden by a\n",
    "# bare except and resurfacing as a NameError (or stale data) on the next line.\n",
    "target_types = display_databases(\"8target_types.csv\")\n",
    "target_types.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdc6416b-3deb-446c-8957-435745b93533",
   "metadata": {},
   "source": [
    "#### Analyze consumption files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8622bd5-a5ab-403f-ab01-758aec879ee4",
   "metadata": {},
   "source": [
    "Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv\n",
    "\n",
    "However, there is no consumptions.csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "purchases = display_databases(\"8purchases.csv\")\n",
    "purchases.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "903321fb-99f8-475d-b4a6-c70ec2efe190",
   "metadata": {},
   "outputs": [],
   "source": [
    "tickets = display_databases(\"8tickets.csv\")\n",
    "tickets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "243e6942-0233-4cd5-b32b-e005457131d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "suppliers = display_databases(\"8suppliers.csv\")\n",
    "suppliers.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd8c876a-f0c5-4123-a422-c267af5f29b1",
   "metadata": {},
   "source": [
    "#### Analyse product file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b82efce-1dee-4d89-8585-28c4ad477eef",
   "metadata": {},
   "outputs": [],
   "source": [
    "products = display_databases(\"8products.csv\")\n",
    "products.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ad143b2-2869-4bd2-982e-688498b98727",
   "metadata": {},
   "source": [
    "#### Analyze pricing files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a54e9a5-801d-4000-9e76-e792edbf7e41",
   "metadata": {},
   "source": [
    "Meaning pricing_formulas.csv and type_of_pricing_formulas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daf37bff-a26d-4ff5-ad50-c90f917164bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n",
    "pricing_formulas.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb14488-b093-4b39-84fa-1c2b4576208f",
   "metadata": {},
   "outputs": [],
   "source": [
    "type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n",
    "type_pricing_formulas.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a084297a-4fd7-4cda-b513-7704f4244a5c",
   "metadata": {},
   "source": [
    "#### Analyze type of products"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76a67ea7-8720-441e-8973-23e5d105370e",
   "metadata": {},
   "source": [
    "Meaning categories.csv, type_of_categories.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6582694d-5339-4f33-a943-c73033121a90",
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = display_databases(\"8categories.csv\")\n",
    "categories.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "589076df-1958-42de-9941-1aff9fa8536f",
   "metadata": {},
   "outputs": [],
   "source": [
    "type_categories = display_databases(\"8type_of_categories.csv\")\n",
    "type_categories.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3427b681-4c05-4e4e-9c2b-867ee789f98c",
   "metadata": {},
   "source": [
    "#### Analyze type of representations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9381e36b-090a-44c5-a29d-3ac4c9a4431e",
   "metadata": {},
   "source": [
    "Meaning representation_category_capacities.csv, representations.csv and representation_types.csv\n",
    "\n",
    "However, there is no representation_types.csv file for this company"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346",
   "metadata": {},
   "outputs": [],
   "source": [
    "representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n",
    "representation_category_capacities.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd405913-033d-4f15-a5b9-103d577baaff",
   "metadata": {},
   "outputs": [],
   "source": [
    "representations = display_databases(\"8representations.csv\")\n",
    "representations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#representation_type = display_databases(\"8representation_types.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9b02406-2a69-4431-8d49-3c6bd6a5e1c7",
   "metadata": {},
   "source": [
    "#### Analyze type of events"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d554266-282c-4f64-9a0f-ddcf591ec912",
   "metadata": {},
   "source": [
    "Meaning events.csv, event_types.csv, seasons.csv and facilities.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "events = display_databases(\"8events.csv\")\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460",
   "metadata": {},
   "outputs": [],
   "source": [
    "event_types = display_databases(\"8event_types.csv\")\n",
    "event_types.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cba0ee58-6280-45fe-99b3-0be09db5922b",
   "metadata": {},
   "outputs": [],
   "source": [
    "seasons = display_databases(\"8seasons.csv\")\n",
    "seasons.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "facilities = display_databases(\"8facilities.csv\")\n",
    "facilities.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7467d41-0ded-465d-bb08-15be914a166b",
   "metadata": {},
   "source": [
    "#### Analyze annexe databases"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17e9e334-0ae4-48d8-bed5-b50b4af49d5b",
   "metadata": {},
   "source": [
    "Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d3ec1040-48b2-40bb-8947-920ddb4589f3",
   "metadata": {},
   "source": [
    "## II. Identify Common Datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec528a8a-df38-48e2-a1be-4a1459a80a1e",
   "metadata": {},
   "source": [
    "From the analysis of the 8th company, we notice that some databases do not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the databases that are common to all companies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c240b811-48a6-4501-9e70-bc51d69e3ac4",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Build a dictionary mapping each company id to the names of its datasets,\n",
    "## with the company prefix removed so names are comparable across companies\n",
    "\n",
    "companies = fs.ls(BUCKET)\n",
    "companies_database = {}\n",
    "\n",
    "for company in companies:\n",
    "    company_id = company.split('/')[-1]\n",
    "    dataset_names = []\n",
    "    for file in fs.ls(company):\n",
    "        base_name = file.split('/')[-1]\n",
    "        # Strip only the leading company id: str.replace would also delete the\n",
    "        # same characters wherever else they appear in the file name.\n",
    "        if base_name.startswith(company_id):\n",
    "            base_name = base_name[len(company_id):]\n",
    "        dataset_names.append(base_name)\n",
    "    companies_database[company_id] = dataset_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54057367-9df9-42f4-aa07-bf524bb76462",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Then we create a list of all databases\n",
    "\n",
    "# Union over every company: the company with the most files is not guaranteed\n",
    "# to own every dataset that exists for another company.\n",
    "all_database = sorted(set().union(*companies_database.values()))\n",
    "print(\"Number of databases : \",len(all_database))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63914e20-9efc-4088-877b-edab5f225d00",
   "metadata": {},
   "outputs": [],
   "source": [
    "## We then create the set of databases that every company has in common\n",
    "\n",
    "data_in_common = set(all_database)\n",
    "\n",
    "print(len(data_in_common))\n",
    "\n",
    "# Plain set intersection: keep only the names that each company provides.\n",
    "for company_files in companies_database.values():\n",
    "    data_in_common &= set(company_files)\n",
    "\n",
    "print(len(data_in_common))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "676d8536-7d8c-4075-a357-b8d06e501ca8",
   "metadata": {},
   "source": [
    "## Create Universal database"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e460fbe-5067-4998-a1a8-9e3d07401750",
   "metadata": {},
   "source": [
    "We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n",
    "\n",
    "Let's first create our procedure for company 1 and the datasets belonging to the products theme"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
   "metadata": {},
   "outputs": [],
   "source": [
    "directory_path = '1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_horodates(df):\n",
    "    \"\"\"\n",
    "    Return a copy of `df` without the `created_at` and `updated_at` columns.\n",
    "\n",
    "    Tables that lack one or both of these columns are accepted and returned\n",
    "    with whatever timestamp column they do have removed, so the helper can be\n",
    "    applied to every dataset. The input DataFrame is not modified.\n",
    "    \"\"\"\n",
    "    # errors=\"ignore\": not every table is guaranteed to carry both timestamps.\n",
    "    return df.drop(columns=[\"created_at\", \"updated_at\"], errors=\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c478213-09ae-44ef-8c7c-125bcb571642",
   "metadata": {},
   "outputs": [],
   "source": [
    "def order_columns_id(df):\n",
    "    \"\"\"\n",
    "    Return `df` with its id columns moved to the front, to make it easier to read.\n",
    "\n",
    "    A column counts as an id column when `id` is one of the underscore-separated\n",
    "    words of its name (`id`, `category_id`, `id_products`, ...). A plain substring\n",
    "    test would also pick up unrelated names such as `identifier` or `is_valid`.\n",
    "    The relative order inside each of the two groups is preserved.\n",
    "    \"\"\"\n",
    "    def is_id_column(col):\n",
    "        # str() so that non-string labels do not raise.\n",
    "        return 'id' in str(col).split('_')\n",
    "\n",
    "    id_columns = [col for col in df.columns if is_id_column(col)]\n",
    "    remaining_col = [col for col in df.columns if not is_id_column(col)]\n",
    "    return df[id_columns + remaining_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def percent_na(df):\n",
    "    \"\"\"\n",
    "    Compute, for every column of `df`, the percentage of missing values.\n",
    "\n",
    "    Returns a Series indexed by column name.\n",
    "    \"\"\"\n",
    "    n_rows = len(df)\n",
    "    missing_counts = df.isna().sum()\n",
    "    return missing_counts * 100 / n_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_df(df):\n",
    "    \"\"\"\n",
    "    Clean a raw table and print a short report about it.\n",
    "\n",
    "    The created_at / updated_at columns are removed and the id columns are moved\n",
    "    to the front. The number of columns, the column names and the percentage of\n",
    "    NA per column are printed along the way. Returns the cleaned DataFrame.\n",
    "    \"\"\"\n",
    "    without_dates = remove_horodates(df)\n",
    "    print(\"Number of columns : \", len(without_dates.columns))\n",
    "    cleaned = order_columns_id(without_dates)\n",
    "    print(\"Columns : \", cleaned.columns)\n",
    "    print(\"Percent of NA for each column : \", percent_na(cleaned))\n",
    "    return cleaned"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
   "metadata": {},
   "source": [
    "#### Deep analysis of products.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "862a7658-0602-4d94-bb58-d23774c00d32",
   "metadata": {},
   "outputs": [],
   "source": [
    "products = display_databases(\"1products.csv\")\n",
    "print(\"Number of columns : \", len(products.columns))\n",
    "products.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
   "metadata": {},
   "outputs": [],
   "source": [
    "products = remove_horodates(products)\n",
    "print(\"Number of columns : \", len(products.columns))\n",
    "products = order_columns_id(products)\n",
    "print(\"Columns : \", products.columns)\n",
    "products.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(products.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "460749ac-aa26-4216-8667-518546f72f72",
   "metadata": {},
   "outputs": [],
   "source": [
    "percent_missing = products.isna().sum() * 100 / len(products)\n",
    "print(percent_missing)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
   "metadata": {},
   "source": [
    "#### Deep analysis of categories.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1categories.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
   "metadata": {},
   "source": [
    "#### Deep analysis of type_of_categories.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
   "metadata": {},
   "source": [
    "#### Deep analysis of representation_category_capacities.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
   "metadata": {},
   "source": [
    "#### Deep analysis of representations.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
   "metadata": {},
   "source": [
    "#### Deep analysis of events.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1events.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6cab507d-8b11-404d-9286-5cc205228af9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fe57873-8108-44c9-b8a5-f58d3cbb6d17",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fd9e5bd-baac-4b3b-9ffb-5a9baa18399b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24186efa-5908-4b03-bf52-96415fc8bd54",
   "metadata": {},
   "source": [
    "#### Deep analysis of event_types.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90ab62d4-a086-4469-961c-67eefb375388",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1event_types.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58db1751-fd56-4c28-b49e-bc8235bb0dc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac93382c-0b5f-462d-8021-0dd1e7201b8c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18cbd630-3c7d-49e1-932b-9460badf3758",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5847a441-31b9-4802-a5ae-90d8c6d6e153",
   "metadata": {},
   "source": [
    "#### Deep analysis of seasons.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae544dcc-f23d-4216-bb5b-597cc1b3765e",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1seasons.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ac97963-9208-4329-be41-d71a5797487f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4593d46-105c-47dd-aa71-babd8e63e65b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d3b096d-8e73-4514-94e5-f2dcd4d0a89c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7b00bc7-eae6-457c-ac68-a4a55a6d1c8c",
   "metadata": {},
   "source": [
    "#### Deep Analysis of facilities.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d95ef015-d44c-4353-8761-771b910d21c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1facilities.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef5fe794-8df7-4f27-8554-ecdc4074ac0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3621201-fab9-49fd-95c1-0b9d5da76e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b198b92-8654-4531-a0dd-8f2e01c2e6c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab5c4c2d-3e04-457d-a183-e173df89b650",
   "metadata": {},
   "source": [
    "## Merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43576244-c8cf-4ca0-b056-7aea1fbf0bc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_df_2(df):\n",
    "    \"\"\"\n",
    "    Clean a raw table without the NA report.\n",
    "\n",
    "    The created_at / updated_at columns are removed and the id columns are moved\n",
    "    to the front. The number of columns and the column names are printed.\n",
    "    Returns the cleaned DataFrame.\n",
    "    \"\"\"\n",
    "    without_dates = remove_horodates(df)\n",
    "    print(\"Number of columns : \", len(without_dates.columns))\n",
    "    cleaned = order_columns_id(without_dates)\n",
    "    print(\"Columns : \", cleaned.columns)\n",
    "    return cleaned"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fad097e-474c-4af7-b1e1-7d8dda3f09ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset(name):\n",
    "    \"\"\"\n",
    "    Load the file `name` through `display_databases` and clean it with `process_df_2`.\n",
    "\n",
    "    The `identifier` column is removed when the table has one.\n",
    "    Returns the cleaned DataFrame.\n",
    "    \"\"\"\n",
    "    cleaned = process_df_2(display_databases(name))\n",
    "    if 'identifier' not in cleaned.columns:\n",
    "        return cleaned\n",
    "    return cleaned.drop(columns = 'identifier')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b60034ef-fdd6-4640-a012-cf74c17b333f",
   "metadata": {},
   "source": [
    "### Products Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6213b1eb-c5f8-49dd-ab69-366542380e80",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_products_table():\n",
    "    \"\"\"\n",
    "    Build the products table from 1products.csv, 1categories.csv and\n",
    "    1type_of_categories.csv.\n",
    "\n",
    "    Products are left-joined with their category, then with the type of\n",
    "    categories. Returns the merged DataFrame with the id columns placed first.\n",
    "    \"\"\"\n",
    "    # Step 1: products <- categories\n",
    "    print(\"first merge products and categories\")\n",
    "    products_df = load_dataset(\"1products.csv\")\n",
    "    categories_df = load_dataset(\"1categories.csv\")\n",
    "    merged = (\n",
    "        products_df\n",
    "        .merge(categories_df, how='left', left_on='category_id', right_on='id',\n",
    "               suffixes=('_products', '_categories'))\n",
    "        .rename(columns={\"name\": \"name_categories\"})\n",
    "    )\n",
    "\n",
    "    # Step 2: result <- type of categories (its own id is dropped before joining)\n",
    "    print(\"Second merge products_theme and type of categories\")\n",
    "    category_types = load_dataset(\"1type_of_categories.csv\").drop(columns='id')\n",
    "    merged = merged.merge(category_types, how='left', on='category_id')\n",
    "\n",
    "    # Index cleaning: drop the join key coming from the categories side\n",
    "    return order_columns_id(merged.drop(columns=['id_categories']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b853e020-f73d-44e8-b086-e5548ce21011",
   "metadata": {},
   "outputs": [],
   "source": [
    "products_theme = create_products_table()\n",
    "products_theme.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bd7b7ab-fd04-48d2-898b-48c5815457f3",
   "metadata": {},
   "source": [
    "### Events Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ed0ad20-8315-4112-9a85-10e5f04ef852",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_events_table():\n",
    "    \"\"\"\n",
    "    Build the events table from 1events.csv, 1seasons.csv, 1event_types.csv and\n",
    "    1facilities.csv.\n",
    "\n",
    "    Events are left-joined successively with seasons, event types and facilities.\n",
    "    Returns the merged DataFrame with the id columns placed first.\n",
    "    \"\"\"\n",
    "    # first merge events and seasons : \n",
    "    print(\"first merge events and seasons : \")\n",
    "    events = load_dataset(\"1events.csv\")\n",
    "    seasons = load_dataset(\"1seasons.csv\")\n",
    "    events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
    "\n",
    "    # Secondly merge events_theme and event_types\n",
    "    print(\"Secondly merge events_theme and event_types : \")\n",
    "    event_types = load_dataset(\"1event_types.csv\")\n",
    "\n",
    "    events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
    "    events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
    "    events_theme = events_theme.drop(columns = 'id')\n",
    "\n",
    "    # thirdly merge events_theme and facilities\n",
    "    print(\"thirdly merge events_theme and facilities : \")\n",
    "    facilities = load_dataset(\"1facilities.csv\")\n",
    "    events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
    "    # Column name fixed: it used to be misspelled \"name_facilties\".\n",
    "    events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\"})\n",
    "    events_theme = events_theme.drop(columns = 'id')\n",
    "\n",
    "    # Index cleaning\n",
    "    events_theme = events_theme.drop(columns = ['id_seasons'])\n",
    "    events_theme  = order_columns_id(events_theme)\n",
    "    return events_theme"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98ef0636-8c45-4a23-a62a-1fbe1544f8ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "events_theme= create_events_table()\n",
    "events_theme.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ad5b680-bb27-4f86-a5f3-7ff4fd1be96a",
   "metadata": {},
   "source": [
    "### Representations Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "481dddd6-80a8-4b9e-a05e-ed06fa3ed7a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_representations_table():\n",
    "    \"\"\"\n",
    "    Build the representations table from 1representations.csv and\n",
    "    1representation_category_capacities.csv.\n",
    "\n",
    "    Representations are left-joined with their category capacities. Returns the\n",
    "    merged DataFrame with the id columns placed first.\n",
    "    \"\"\"\n",
    "    representations_df = load_dataset(\"1representations.csv\")\n",
    "    capacities_df = load_dataset(\"1representation_category_capacities.csv\")\n",
    "\n",
    "    joined = pd.merge(representations_df, capacities_df, how='left',\n",
    "                      left_on='id', right_on='representation_id',\n",
    "                      suffixes=('_representation', '_representation_cap'))\n",
    "\n",
    "    # index cleaning: the join key from the capacities side is dropped\n",
    "    return order_columns_id(joined.drop(columns = [\"representation_id\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "677f4ed8-ef58-45f2-9056-ede0898c6a64",
   "metadata": {},
   "outputs": [],
   "source": [
    "rep = create_representations_table()\n",
    "rep.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b26f4e7e-134d-4e32-a615-4b0e6bb80b25",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}