BDC-team-1/Notebook_AR.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "455cc769-1b3b-4fef-b395-e74a988ceed3",
   "metadata": {},
   "source": [
    "## Notebook Alexis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import s3fs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "30494c5e-9649-4fff-8708-617544188b20",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bdc2324-data/1',\n",
       " 'bdc2324-data/10',\n",
       " 'bdc2324-data/101',\n",
       " 'bdc2324-data/11',\n",
       " 'bdc2324-data/12',\n",
       " 'bdc2324-data/13',\n",
       " 'bdc2324-data/14',\n",
       " 'bdc2324-data/2',\n",
       " 'bdc2324-data/3',\n",
       " 'bdc2324-data/4',\n",
       " 'bdc2324-data/5',\n",
       " 'bdc2324-data/6',\n",
       " 'bdc2324-data/7',\n",
       " 'bdc2324-data/8',\n",
       " 'bdc2324-data/9']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create filesystem object\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
    "\n",
    "BUCKET = \"bdc2324-data\"\n",
    "fs.ls(BUCKET)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2feffee9-9f23-4caa-8a01-9e4a93abbf5d",
   "metadata": {},
   "source": [
    "### I. Analysis of the company 8 files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f54ba449-2051-4acd-939d-d30abd5452fe",
   "metadata": {},
   "source": [
    "This section describes the databases associated with company 8. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f1cce705-46e1-42de-8e93-2ee15312d288",
   "metadata": {},
   "outputs": [],
   "source": [
    "directory_path = '8'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bdc2324-data/8/8campaign_stats.csv\n",
      "bdc2324-data/8/8campaigns.csv\n",
      "bdc2324-data/8/8categories.csv\n",
      "bdc2324-data/8/8countries.csv\n",
      "bdc2324-data/8/8currencies.csv\n",
      "bdc2324-data/8/8customer_target_mappings.csv\n",
      "bdc2324-data/8/8customersplus.csv\n",
      "bdc2324-data/8/8event_types.csv\n",
      "bdc2324-data/8/8events.csv\n",
      "bdc2324-data/8/8facilities.csv\n",
      "bdc2324-data/8/8link_stats.csv\n",
      "bdc2324-data/8/8pricing_formulas.csv\n",
      "bdc2324-data/8/8product_packs.csv\n",
      "bdc2324-data/8/8products.csv\n",
      "bdc2324-data/8/8products_groups.csv\n",
      "bdc2324-data/8/8purchases.csv\n",
      "bdc2324-data/8/8representation_category_capacities.csv\n",
      "bdc2324-data/8/8representations.csv\n",
      "bdc2324-data/8/8seasons.csv\n",
      "bdc2324-data/8/8suppliers.csv\n",
      "bdc2324-data/8/8target_types.csv\n",
      "bdc2324-data/8/8targets.csv\n",
      "bdc2324-data/8/8tickets.csv\n",
      "bdc2324-data/8/8type_of_categories.csv\n",
      "bdc2324-data/8/8type_of_pricing_formulas.csv\n",
      "bdc2324-data/8/8type_ofs.csv\n"
     ]
    }
   ],
   "source": [
    "# check the files in the directory\n",
    "\n",
    "objects = fs.ls(f'{BUCKET}/{directory_path}')\n",
    "\n",
    "for file in objects:\n",
    "    print(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_databases(file_name, directory=None):\n",
    "    \"\"\"\n",
    "    Load a CSV file stored in the S3 bucket and return it as a DataFrame.\n",
    "\n",
    "    The resolved path and the shape of the loaded table are printed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_name : str\n",
    "        Name of the CSV file inside the company directory, e.g. '8products.csv'.\n",
    "    directory : str, optional\n",
    "        Company directory inside BUCKET. When omitted, the notebook-level\n",
    "        `directory_path` is read at call time.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Content of the CSV file parsed with a comma separator.\n",
    "    \"\"\"\n",
    "    if directory is None:\n",
    "        # directory_path is reassigned between sections of the notebook,\n",
    "        # so it is looked up on every call rather than captured once\n",
    "        directory = directory_path\n",
    "    file_path = f\"{BUCKET}/{directory}/{file_name}\"\n",
    "    print(\"File path : \", file_path)\n",
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in, sep=\",\")\n",
    "\n",
    "    print(\"Shape : \", df.shape)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddd545ef-7e9f-4696-962a-115294991641",
   "metadata": {},
   "source": [
    "#### Look at campaigns files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0214d30d-5f83-498f-867f-e67b5793b731",
   "metadata": {},
   "outputs": [],
   "source": [
    "campaigns = display_databases(\"8campaigns.csv\")\n",
    "campaigns.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7982be4-2c42-4a91-be5a-329a999644cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "campaign_stats = display_databases(\"8campaign_stats.csv\")\n",
    "campaign_stats.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6512bc9-91f5-4fe4-a637-a4e84dc497a9",
   "metadata": {},
   "source": [
    "#### Look at links files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28e7c1fe-470f-4d84-87b8-a711a973500b",
   "metadata": {},
   "source": [
    "There is no links file for this company, only the link_stats file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e973575b-4ed6-4b23-8024-f383ac82e87c",
   "metadata": {},
   "outputs": [],
   "source": [
    "links_stats = display_databases(\"8link_stats.csv\")\n",
    "links_stats.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8dfcca1f-1323-413f-aa8d-3ee5ce2610a8",
   "metadata": {},
   "source": [
    "#### Analyse Customersplus file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b523575-c779-451c-a12e-a36fb4ad232c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reuse the shared loader instead of repeating the path / open / read_csv steps\n",
    "customersplus = display_databases(\"8customersplus.csv\")\n",
    "customersplus.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe56785a-ed3c-4322-aafa-a630f97b836f",
   "metadata": {},
   "source": [
    "#### Analyse Structures files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_name = \"8structures.csv\"\n",
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
    "print(file_path)\n",
    "try:\n",
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
    "        structures = pd.read_csv(file_in, sep=\",\")\n",
    "except FileNotFoundError:\n",
    "    # only a missing file means \"no database\"; any other error\n",
    "    # (credentials, network, malformed CSV) must stay visible\n",
    "    structures = None\n",
    "    print(\"No structures database\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8452558-2d32-459b-91e7-f6042345e465",
   "metadata": {},
   "source": [
    "For Stade Français, there are no structures, tags or structure_tag_mapping databases."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "285b1422-9ca9-4afd-b752-777a54aaa677",
   "metadata": {},
   "source": [
    "#### Analyze Target databases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the file is present in the bucket listing, so load it with the shared helper;\n",
    "# a failure now raises instead of being swallowed and followed by a NameError\n",
    "customer_targets = display_databases(\"8customer_target_mappings.csv\")\n",
    "customer_targets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the file is present in the bucket listing, so load it with the shared helper;\n",
    "# a failure now raises instead of being swallowed and followed by a NameError\n",
    "targets = display_databases(\"8targets.csv\")\n",
    "targets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85696d74-3b2f-4368-9045-44db5322b60d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the file is present in the bucket listing, so load it with the shared helper;\n",
    "# a failure now raises instead of being swallowed and followed by a NameError\n",
    "target_types = display_databases(\"8target_types.csv\")\n",
    "target_types.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdc6416b-3deb-446c-8957-435745b93533",
   "metadata": {},
   "source": [
    "#### Analyze consumption files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8622bd5-a5ab-403f-ab01-758aec879ee4",
   "metadata": {},
   "source": [
    "Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv\n",
    "\n",
    "However, there is no consumptions.csv file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "purchases = display_databases(\"8purchases.csv\")\n",
    "purchases.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "903321fb-99f8-475d-b4a6-c70ec2efe190",
   "metadata": {},
   "outputs": [],
   "source": [
    "tickets = display_databases(\"8tickets.csv\")\n",
    "tickets.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "243e6942-0233-4cd5-b32b-e005457131d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "suppliers = display_databases(\"8suppliers.csv\")\n",
    "suppliers.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd8c876a-f0c5-4123-a422-c267af5f29b1",
   "metadata": {},
   "source": [
    "#### Analyse product file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b82efce-1dee-4d89-8585-28c4ad477eef",
   "metadata": {},
   "outputs": [],
   "source": [
    "products = display_databases(\"8products.csv\")\n",
    "products.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ad143b2-2869-4bd2-982e-688498b98727",
   "metadata": {},
   "source": [
    "#### Analyze pricing files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a54e9a5-801d-4000-9e76-e792edbf7e41",
   "metadata": {},
   "source": [
    "Meaning pricing_formulas.csv and type_of_pricing_formulas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daf37bff-a26d-4ff5-ad50-c90f917164bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n",
    "pricing_formulas.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdb14488-b093-4b39-84fa-1c2b4576208f",
   "metadata": {},
   "outputs": [],
   "source": [
    "type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n",
    "type_pricing_formulas.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a084297a-4fd7-4cda-b513-7704f4244a5c",
   "metadata": {},
   "source": [
    "#### Analyze type of products"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76a67ea7-8720-441e-8973-23e5d105370e",
   "metadata": {},
   "source": [
    "Meaning categories.csv, type_of_categories.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6582694d-5339-4f33-a943-c73033121a90",
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = display_databases(\"8categories.csv\")\n",
    "categories.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "589076df-1958-42de-9941-1aff9fa8536f",
   "metadata": {},
   "outputs": [],
   "source": [
    "type_categories = display_databases(\"8type_of_categories.csv\")\n",
    "type_categories.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3427b681-4c05-4e4e-9c2b-867ee789f98c",
   "metadata": {},
   "source": [
    "#### Analyze type of representations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9381e36b-090a-44c5-a29d-3ac4c9a4431e",
   "metadata": {},
   "source": [
    "Meaning representation_category_capacities.csv, representations.csv and representation_types.csv\n",
    "\n",
    "However, there is no representation_types database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346",
   "metadata": {},
   "outputs": [],
   "source": [
    "representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n",
    "representation_category_capacities.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd405913-033d-4f15-a5b9-103d577baaff",
   "metadata": {},
   "outputs": [],
   "source": [
    "representations = display_databases(\"8representations.csv\")\n",
    "representations.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#representation_type = display_databases(\"8representation_types.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9b02406-2a69-4431-8d49-3c6bd6a5e1c7",
   "metadata": {},
   "source": [
    "#### Analyze type of events"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1d554266-282c-4f64-9a0f-ddcf591ec912",
   "metadata": {},
   "source": [
    "Meaning events.csv, event_types.csv, seasons.csv and facilities.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "events = display_databases(\"8events.csv\")\n",
    "events.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460",
   "metadata": {},
   "outputs": [],
   "source": [
    "event_types = display_databases(\"8event_types.csv\")\n",
    "event_types.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cba0ee58-6280-45fe-99b3-0be09db5922b",
   "metadata": {},
   "outputs": [],
   "source": [
    "seasons = display_databases(\"8seasons.csv\")\n",
    "seasons.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "facilities = display_databases(\"8facilities.csv\")\n",
    "facilities.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7467d41-0ded-465d-bb08-15be914a166b",
   "metadata": {},
   "source": [
    "#### Analyze annexe databases"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17e9e334-0ae4-48d8-bed5-b50b4af49d5b",
   "metadata": {},
   "source": [
    "Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csv"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d3ec1040-48b2-40bb-8947-920ddb4589f3",
   "metadata": {},
   "source": [
    "## II. Identify Common Datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec528a8a-df38-48e2-a1be-4a1459a80a1e",
   "metadata": {},
   "source": [
    "From the analysis of company 8, we notice that some databases do not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the databases that are common to all companies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c240b811-48a6-4501-9e70-bc51d69e3ac4",
   "metadata": {},
   "outputs": [],
   "source": [
    "## We first construct a dictionary listing all the datasets of each company\n",
    "\n",
    "companies = fs.ls(BUCKET)\n",
    "companies_database = {}\n",
    "\n",
    "for company in companies:\n",
    "    company_id = company.split('/')[-1]\n",
    "    file_names = [path.split('/')[-1] for path in fs.ls(company)]\n",
    "    # strip only the leading company id: str.replace would also delete\n",
    "    # the same digits anywhere else in the file name\n",
    "    companies_database[company_id] = [\n",
    "        name[len(company_id):] if name.startswith(company_id) else name\n",
    "        for name in file_names\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "54057367-9df9-42f4-aa07-bf524bb76462",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of databases :  30\n"
     ]
    }
   ],
   "source": [
    "# Then we take, as reference list, the files of the company that has the most databases\n",
    "\n",
    "all_database = max(companies_database.values(), key=len)\n",
    "print(\"Number of databases : \", len(all_database))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "63914e20-9efc-4088-877b-edab5f225d00",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30\n",
      "23\n"
     ]
    }
   ],
   "source": [
    "## We then build the set of databases shared by every company\n",
    "\n",
    "data_in_common = set(all_database)\n",
    "\n",
    "print(len(data_in_common))\n",
    "\n",
    "for company_files in companies_database.values():\n",
    "    # keep only the names that this company also provides\n",
    "    data_in_common = data_in_common.intersection(company_files)\n",
    "\n",
    "print(len(data_in_common))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "676d8536-7d8c-4075-a357-b8d06e501ca8",
   "metadata": {},
   "source": [
    "## Create Universal database"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e460fbe-5067-4998-a1a8-9e3d07401750",
   "metadata": {},
   "source": [
    "We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n",
    "\n",
    "Let's first create our procedure for company 1 and the datasets belonging to the products theme."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
   "metadata": {},
   "outputs": [],
   "source": [
    "directory_path = '1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_horodates(df):\n",
    "    \"\"\"\n",
    "    Return a copy of `df` without the timestamp columns created_at and updated_at.\n",
    "\n",
    "    Columns that are absent are skipped, so the function can be applied to a\n",
    "    table that lacks them, or applied again to an already cleaned DataFrame\n",
    "    (e.g. when a cell is re-run), without raising a KeyError.\n",
    "    The input DataFrame is not modified.\n",
    "    \"\"\"\n",
    "    return df.drop(columns=[\"created_at\", \"updated_at\"], errors=\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "2c478213-09ae-44ef-8c7c-125bcb571642",
   "metadata": {},
   "outputs": [],
   "source": [
    "def order_columns_id(df, substring='id'):\n",
    "    \"\"\"\n",
    "    Return `df` with the identifier-like columns moved to the front.\n",
    "\n",
    "    A column is moved when its name contains `substring`. The relative order\n",
    "    inside each of the two groups is preserved.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Table whose columns are reordered.\n",
    "    substring : str, optional\n",
    "        Text looked for in each column name. With the default 'id' the match is\n",
    "        a plain substring test, so names such as 'identifier' are moved too;\n",
    "        pass '_id' to restrict the match to foreign-key style names.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The same data with matching columns first.\n",
    "    \"\"\"\n",
    "    # str() lets the test work on non-string column labels as well\n",
    "    id_columns = [col for col in df.columns if substring in str(col)]\n",
    "    remaining_col = [col for col in df.columns if substring not in str(col)]\n",
    "    return df[id_columns + remaining_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def percent_na(df):\n",
    "    \"\"\"\n",
    "    Compute, for every column of `df`, the share of missing values in percent.\n",
    "\n",
    "    Returns a Series indexed by column name.\n",
    "    \"\"\"\n",
    "    na_counts = df.isna().sum()\n",
    "    n_rows = len(df)\n",
    "    return na_counts * 100 / n_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_df(df):\n",
    "    \"\"\"\n",
    "    Apply the cleaning steps to `df` and print a short report.\n",
    "\n",
    "    The timestamp columns are dropped with remove_horodates, the id columns are\n",
    "    moved to the front with order_columns_id, and the column count, column\n",
    "    names and percentage of missing values per column are printed.\n",
    "    Returns the cleaned, reordered DataFrame.\n",
    "    \"\"\"\n",
    "    without_dates = remove_horodates(df)\n",
    "    print(\"Number of columns : \", len(without_dates.columns))\n",
    "    reordered = order_columns_id(without_dates)\n",
    "    print(\"Columns : \", reordered.columns)\n",
    "    print(\"Percent of NA for each column : \", percent_na(reordered))\n",
    "    return reordered"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
   "metadata": {},
   "source": [
    "#### Deep analysis of products.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "862a7658-0602-4d94-bb58-d23774c00d32",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1products.csv\n",
      "Shape :  (94803, 14)\n",
      "Number of columns :  14\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>amount</th>\n",
       "      <th>is_full_price</th>\n",
       "      <th>representation_id</th>\n",
       "      <th>pricing_formula_id</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "      <th>category_id</th>\n",
       "      <th>apply_price</th>\n",
       "      <th>products_group_id</th>\n",
       "      <th>product_pack_id</th>\n",
       "      <th>extra_field</th>\n",
       "      <th>amount_consumption</th>\n",
       "      <th>identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10682</td>\n",
       "      <td>9.0</td>\n",
       "      <td>False</td>\n",
       "      <td>914</td>\n",
       "      <td>114</td>\n",
       "      <td>2020-09-03 14:09:43.119798+02:00</td>\n",
       "      <td>2020-09-03 14:09:43.119798+02:00</td>\n",
       "      <td>41</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10655</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>35c88f2db8a63d7474e46eb8ca9260e7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>478</td>\n",
       "      <td>9.5</td>\n",
       "      <td>False</td>\n",
       "      <td>273</td>\n",
       "      <td>131</td>\n",
       "      <td>2020-09-03 13:21:22.711773+02:00</td>\n",
       "      <td>2020-09-03 13:21:22.711773+02:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>471</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8a179671ab198e570e6a104c4451379f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20873</td>\n",
       "      <td>11.5</td>\n",
       "      <td>False</td>\n",
       "      <td>275</td>\n",
       "      <td>137</td>\n",
       "      <td>2020-09-03 14:46:33.589030+02:00</td>\n",
       "      <td>2020-09-03 14:46:33.589030+02:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>20825</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ee83779ce29e67ad251e40234b426d6a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>157142</td>\n",
       "      <td>8.0</td>\n",
       "      <td>False</td>\n",
       "      <td>82519</td>\n",
       "      <td>9</td>\n",
       "      <td>2022-01-28 19:29:23.525722+01:00</td>\n",
       "      <td>2022-01-28 19:29:23.525722+01:00</td>\n",
       "      <td>5</td>\n",
       "      <td>0.0</td>\n",
       "      <td>156773</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>d865383579314b791aa4bcf3fb418f17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1341</td>\n",
       "      <td>8.5</td>\n",
       "      <td>False</td>\n",
       "      <td>9</td>\n",
       "      <td>93</td>\n",
       "      <td>2020-09-03 13:29:30.773089+02:00</td>\n",
       "      <td>2020-09-03 13:29:30.773089+02:00</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1175</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>f1c4689bc47dee6f60b56d74b593dd46</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  amount  is_full_price  representation_id  pricing_formula_id  \\\n",
       "0   10682     9.0          False                914                 114   \n",
       "1     478     9.5          False                273                 131   \n",
       "2   20873    11.5          False                275                 137   \n",
       "3  157142     8.0          False              82519                   9   \n",
       "4    1341     8.5          False                  9                  93   \n",
       "\n",
       "                         created_at                        updated_at  \\\n",
       "0  2020-09-03 14:09:43.119798+02:00  2020-09-03 14:09:43.119798+02:00   \n",
       "1  2020-09-03 13:21:22.711773+02:00  2020-09-03 13:21:22.711773+02:00   \n",
       "2  2020-09-03 14:46:33.589030+02:00  2020-09-03 14:46:33.589030+02:00   \n",
       "3  2022-01-28 19:29:23.525722+01:00  2022-01-28 19:29:23.525722+01:00   \n",
       "4  2020-09-03 13:29:30.773089+02:00  2020-09-03 13:29:30.773089+02:00   \n",
       "\n",
       "   category_id  apply_price  products_group_id  product_pack_id  extra_field  \\\n",
       "0           41          0.0              10655                1          NaN   \n",
       "1            1          0.0                471                1          NaN   \n",
       "2            1          0.0              20825                1          NaN   \n",
       "3            5          0.0             156773                1          NaN   \n",
       "4            1          0.0               1175                1          NaN   \n",
       "\n",
       "   amount_consumption                        identifier  \n",
       "0                 NaN  35c88f2db8a63d7474e46eb8ca9260e7  \n",
       "1                 NaN  8a179671ab198e570e6a104c4451379f  \n",
       "2                 NaN  ee83779ce29e67ad251e40234b426d6a  \n",
       "3                 NaN  d865383579314b791aa4bcf3fb418f17  \n",
       "4                 NaN  f1c4689bc47dee6f60b56d74b593dd46  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "products = display_databases(\"1products.csv\")\n",
    "print(\"Number of columns : \", len(products.columns))\n",
    "products.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns :  12\n",
      "Columns :  Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
      "       'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
      "       'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>representation_id</th>\n",
       "      <th>pricing_formula_id</th>\n",
       "      <th>category_id</th>\n",
       "      <th>products_group_id</th>\n",
       "      <th>product_pack_id</th>\n",
       "      <th>identifier</th>\n",
       "      <th>amount</th>\n",
       "      <th>is_full_price</th>\n",
       "      <th>apply_price</th>\n",
       "      <th>extra_field</th>\n",
       "      <th>amount_consumption</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10682</td>\n",
       "      <td>914</td>\n",
       "      <td>114</td>\n",
       "      <td>41</td>\n",
       "      <td>10655</td>\n",
       "      <td>1</td>\n",
       "      <td>35c88f2db8a63d7474e46eb8ca9260e7</td>\n",
       "      <td>9.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>478</td>\n",
       "      <td>273</td>\n",
       "      <td>131</td>\n",
       "      <td>1</td>\n",
       "      <td>471</td>\n",
       "      <td>1</td>\n",
       "      <td>8a179671ab198e570e6a104c4451379f</td>\n",
       "      <td>9.5</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20873</td>\n",
       "      <td>275</td>\n",
       "      <td>137</td>\n",
       "      <td>1</td>\n",
       "      <td>20825</td>\n",
       "      <td>1</td>\n",
       "      <td>ee83779ce29e67ad251e40234b426d6a</td>\n",
       "      <td>11.5</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>157142</td>\n",
       "      <td>82519</td>\n",
       "      <td>9</td>\n",
       "      <td>5</td>\n",
       "      <td>156773</td>\n",
       "      <td>1</td>\n",
       "      <td>d865383579314b791aa4bcf3fb418f17</td>\n",
       "      <td>8.0</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1341</td>\n",
       "      <td>9</td>\n",
       "      <td>93</td>\n",
       "      <td>1</td>\n",
       "      <td>1175</td>\n",
       "      <td>1</td>\n",
       "      <td>f1c4689bc47dee6f60b56d74b593dd46</td>\n",
       "      <td>8.5</td>\n",
       "      <td>False</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id  representation_id  pricing_formula_id  category_id  \\\n",
       "0   10682                914                 114           41   \n",
       "1     478                273                 131            1   \n",
       "2   20873                275                 137            1   \n",
       "3  157142              82519                   9            5   \n",
       "4    1341                  9                  93            1   \n",
       "\n",
       "   products_group_id  product_pack_id                        identifier  \\\n",
       "0              10655                1  35c88f2db8a63d7474e46eb8ca9260e7   \n",
       "1                471                1  8a179671ab198e570e6a104c4451379f   \n",
       "2              20825                1  ee83779ce29e67ad251e40234b426d6a   \n",
       "3             156773                1  d865383579314b791aa4bcf3fb418f17   \n",
       "4               1175                1  f1c4689bc47dee6f60b56d74b593dd46   \n",
       "\n",
       "   amount  is_full_price  apply_price  extra_field  amount_consumption  \n",
       "0     9.0          False          0.0          NaN                 NaN  \n",
       "1     9.5          False          0.0          NaN                 NaN  \n",
       "2    11.5          False          0.0          NaN                 NaN  \n",
       "3     8.0          False          0.0          NaN                 NaN  \n",
       "4     8.5          False          0.0          NaN                 NaN  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove_horodates and order_columns_id are helpers defined earlier in this notebook.\n",
    "# First pass: apply remove_horodates and report how many columns are left.\n",
    "products = remove_horodates(products)\n",
    "print(\"Number of columns : \", products.shape[1])\n",
    "\n",
    "# Second pass: apply order_columns_id and show the resulting column layout.\n",
    "products = order_columns_id(products)\n",
    "print(\"Columns : \", products.columns)\n",
    "\n",
    "products.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                      int64\n",
      "representation_id       int64\n",
      "pricing_formula_id      int64\n",
      "category_id             int64\n",
      "products_group_id       int64\n",
      "product_pack_id         int64\n",
      "identifier             object\n",
      "amount                float64\n",
      "is_full_price            bool\n",
      "apply_price           float64\n",
      "extra_field           float64\n",
      "amount_consumption    float64\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(products.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "460749ac-aa26-4216-8667-518546f72f72",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                      0.0\n",
      "representation_id       0.0\n",
      "pricing_formula_id      0.0\n",
      "category_id             0.0\n",
      "products_group_id       0.0\n",
      "product_pack_id         0.0\n",
      "identifier              0.0\n",
      "amount                  0.0\n",
      "is_full_price           0.0\n",
      "apply_price             0.0\n",
      "extra_field           100.0\n",
      "amount_consumption    100.0\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Percentage of missing values per column: NA count * 100 / number of rows.\n",
    "percent_missing = products.isna().sum().mul(100).div(len(products))\n",
    "print(percent_missing)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
   "metadata": {},
   "source": [
    "#### Deep analysis of categories.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1categories.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1categories.csv\n",
      "Shape :  (27, 7)\n",
      "Number of columns :  7\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "      <th>extra_field</th>\n",
       "      <th>quota</th>\n",
       "      <th>identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>30</td>\n",
       "      <td>en nb entrées gr</td>\n",
       "      <td>2020-09-03 13:21:20.019202+02:00</td>\n",
       "      <td>2020-09-03 13:21:20.019202+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "      <td>indiv activité enfant</td>\n",
       "      <td>2020-09-03 13:11:23.306968+02:00</td>\n",
       "      <td>2020-09-03 13:11:23.306968+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>39</td>\n",
       "      <td>indiv activité gr</td>\n",
       "      <td>2020-09-03 13:21:20.029901+02:00</td>\n",
       "      <td>2020-09-03 13:21:20.029901+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1108</td>\n",
       "      <td>groupe forfait adulte</td>\n",
       "      <td>2020-09-19 02:06:43.145697+02:00</td>\n",
       "      <td>2020-09-19 02:06:43.145697+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3edda20c877a93b5ff883827238eb711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>groupe forfait entrées tr</td>\n",
       "      <td>2020-09-03 13:11:23.264997+02:00</td>\n",
       "      <td>2020-09-03 13:11:23.264997+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     id                       name                        created_at  \\\n",
       "0    30           en nb entrées gr  2020-09-03 13:21:20.019202+02:00   \n",
       "1    16      indiv activité enfant  2020-09-03 13:11:23.306968+02:00   \n",
       "2    39          indiv activité gr  2020-09-03 13:21:20.029901+02:00   \n",
       "3  1108      groupe forfait adulte  2020-09-19 02:06:43.145697+02:00   \n",
       "4     6  groupe forfait entrées tr  2020-09-03 13:11:23.264997+02:00   \n",
       "\n",
       "                         updated_at  extra_field  quota  \\\n",
       "0  2020-09-03 13:21:20.019202+02:00          NaN    NaN   \n",
       "1  2020-09-03 13:11:23.306968+02:00          NaN    NaN   \n",
       "2  2020-09-03 13:21:20.029901+02:00          NaN    NaN   \n",
       "3  2020-09-19 02:06:43.145697+02:00          NaN    NaN   \n",
       "4  2020-09-03 13:11:23.264997+02:00          NaN    NaN   \n",
       "\n",
       "                         identifier  \n",
       "0  849ab2791a14f5fc2bb4d87ab2b78bf6  \n",
       "1  425fd2f01984cc4ba030c1be98f42c33  \n",
       "2  9244dd3738788db0d22a5d0afe687b69  \n",
       "3  3edda20c877a93b5ff883827238eb711  \n",
       "4  ff48df4b2dd5a14116bf4d280b31621e  "
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the file named in `name_dataset` through the notebook's display_databases helper,\n",
    "# print its column count and preview the first rows.\n",
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", df.shape[1])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns :  5\n",
      "Columns :  Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
      "Percent of NA for each column :  id               0.000000\n",
      "identifier       0.000000\n",
      "name             3.703704\n",
      "extra_field    100.000000\n",
      "quota          100.000000\n",
      "dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>identifier</th>\n",
       "      <th>name</th>\n",
       "      <th>extra_field</th>\n",
       "      <th>quota</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>30</td>\n",
       "      <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
       "      <td>en nb entrées gr</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "      <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
       "      <td>indiv activité enfant</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>39</td>\n",
       "      <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
       "      <td>indiv activité gr</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1108</td>\n",
       "      <td>3edda20c877a93b5ff883827238eb711</td>\n",
       "      <td>groupe forfait adulte</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>6</td>\n",
       "      <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
       "      <td>groupe forfait entrées tr</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     id                        identifier                       name  \\\n",
       "0    30  849ab2791a14f5fc2bb4d87ab2b78bf6           en nb entrées gr   \n",
       "1    16  425fd2f01984cc4ba030c1be98f42c33      indiv activité enfant   \n",
       "2    39  9244dd3738788db0d22a5d0afe687b69          indiv activité gr   \n",
       "3  1108  3edda20c877a93b5ff883827238eb711      groupe forfait adulte   \n",
       "4     6  ff48df4b2dd5a14116bf4d280b31621e  groupe forfait entrées tr   \n",
       "\n",
       "   extra_field  quota  \n",
       "0          NaN    NaN  \n",
       "1          NaN    NaN  \n",
       "2          NaN    NaN  \n",
       "3          NaN    NaN  \n",
       "4          NaN    NaN  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the notebook's process_df helper on the loaded frame, then preview the result.\n",
    "df = df.pipe(process_df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id               int64\n",
       "identifier      object\n",
       "name            object\n",
       "extra_field    float64\n",
       "quota          float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
   "metadata": {},
   "source": [
    "#### Deep analysis of type_of_categories.csv\n",
    "\n",
    "*TODO: no analysis cells have been written for this file yet.*"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
   "metadata": {},
   "source": [
    "#### Deep analysis of representation_category_capacities.csv\n",
    "\n",
    "*TODO: no analysis cells have been written for this file yet.*"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
   "metadata": {},
   "source": [
    "#### Deep analysis of representations.csv\n",
    "\n",
    "*TODO: no analysis cells have been written for this file yet.*"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
   "metadata": {},
   "source": [
    "#### Deep analysis of events.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1events.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "6cab507d-8b11-404d-9286-5cc205228af9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1events.csv\n",
      "Shape :  (1232, 12)\n",
      "Number of columns :  12\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "      <th>season_id</th>\n",
       "      <th>facility_id</th>\n",
       "      <th>name</th>\n",
       "      <th>event_type_id</th>\n",
       "      <th>manual_added</th>\n",
       "      <th>is_display</th>\n",
       "      <th>event_type_key_id</th>\n",
       "      <th>facility_key_id</th>\n",
       "      <th>identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>192</td>\n",
       "      <td>2020-09-03 13:36:42.216991+02:00</td>\n",
       "      <td>2021-11-02 15:06:40.663219+01:00</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>frontières</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>c1cecd093146068fd57896e254e98170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30329</td>\n",
       "      <td>2023-11-04 02:50:34.602462+01:00</td>\n",
       "      <td>2023-11-04 02:52:26.138154+01:00</td>\n",
       "      <td>2767</td>\n",
       "      <td>1</td>\n",
       "      <td>visite guidée une autre histoire du monde (1h00)</td>\n",
       "      <td>5</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>f510a6710878d7aca36e71c54abab525</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>161</td>\n",
       "      <td>2020-09-03 13:29:27.944002+02:00</td>\n",
       "      <td>2021-11-02 15:06:40.652026+01:00</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>visite contée les chercheurs d'or indiv</td>\n",
       "      <td>2</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>21177fa9acad1ae2b1f595690fb853d3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5957</td>\n",
       "      <td>2021-07-31 11:16:42.575583+02:00</td>\n",
       "      <td>2021-11-02 15:06:40.663219+01:00</td>\n",
       "      <td>582</td>\n",
       "      <td>1</td>\n",
       "      <td>we dreamt of utopia and we woke up screaming.</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>962601f1eb153d45d49437f8fe839f7f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8337</td>\n",
       "      <td>2021-08-17 13:40:34.111923+02:00</td>\n",
       "      <td>2021-11-02 15:06:40.663219+01:00</td>\n",
       "      <td>582</td>\n",
       "      <td>1</td>\n",
       "      <td>jeff koons épisodes 4</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>bfa22f5a2364a2dacfc45cca1c8d3215</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id                        created_at                        updated_at  \\\n",
       "0    192  2020-09-03 13:36:42.216991+02:00  2021-11-02 15:06:40.663219+01:00   \n",
       "1  30329  2023-11-04 02:50:34.602462+01:00  2023-11-04 02:52:26.138154+01:00   \n",
       "2    161  2020-09-03 13:29:27.944002+02:00  2021-11-02 15:06:40.652026+01:00   \n",
       "3   5957  2021-07-31 11:16:42.575583+02:00  2021-11-02 15:06:40.663219+01:00   \n",
       "4   8337  2021-08-17 13:40:34.111923+02:00  2021-11-02 15:06:40.663219+01:00   \n",
       "\n",
       "   season_id  facility_id                                              name  \\\n",
       "0         16            1                                        frontières   \n",
       "1       2767            1  visite guidée une autre histoire du monde (1h00)   \n",
       "2         16            1           visite contée les chercheurs d'or indiv   \n",
       "3        582            1     we dreamt of utopia and we woke up screaming.   \n",
       "4        582            1                             jeff koons épisodes 4   \n",
       "\n",
       "   event_type_id  manual_added  is_display  event_type_key_id  \\\n",
       "0              4         False        True                  4   \n",
       "1              5         False        True                  5   \n",
       "2              2         False        True                  2   \n",
       "3              4         False        True                  4   \n",
       "4              4         False        True                  4   \n",
       "\n",
       "   facility_key_id                        identifier  \n",
       "0                1  c1cecd093146068fd57896e254e98170  \n",
       "1                1  f510a6710878d7aca36e71c54abab525  \n",
       "2                1  21177fa9acad1ae2b1f595690fb853d3  \n",
       "3                1  962601f1eb153d45d49437f8fe839f7f  \n",
       "4                1  bfa22f5a2364a2dacfc45cca1c8d3215  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the file named in `name_dataset` through the notebook's display_databases helper,\n",
    "# print its column count and preview the first rows.\n",
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", df.shape[1])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "9fe57873-8108-44c9-b8a5-f58d3cbb6d17",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns :  10\n",
      "Columns :  Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
      "       'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
      "      dtype='object')\n",
      "Percent of NA for each column :  id                   0.000000\n",
      "season_id            0.000000\n",
      "facility_id          0.000000\n",
      "event_type_id        0.000000\n",
      "event_type_key_id    0.000000\n",
      "facility_key_id      0.000000\n",
      "identifier           0.000000\n",
      "name                 0.974026\n",
      "manual_added         0.000000\n",
      "is_display           0.000000\n",
      "dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>season_id</th>\n",
       "      <th>facility_id</th>\n",
       "      <th>event_type_id</th>\n",
       "      <th>event_type_key_id</th>\n",
       "      <th>facility_key_id</th>\n",
       "      <th>identifier</th>\n",
       "      <th>name</th>\n",
       "      <th>manual_added</th>\n",
       "      <th>is_display</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>192</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>c1cecd093146068fd57896e254e98170</td>\n",
       "      <td>frontières</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30329</td>\n",
       "      <td>2767</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>f510a6710878d7aca36e71c54abab525</td>\n",
       "      <td>visite guidée une autre histoire du monde (1h00)</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>161</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>21177fa9acad1ae2b1f595690fb853d3</td>\n",
       "      <td>visite contée les chercheurs d'or indiv</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5957</td>\n",
       "      <td>582</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>962601f1eb153d45d49437f8fe839f7f</td>\n",
       "      <td>we dreamt of utopia and we woke up screaming.</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8337</td>\n",
       "      <td>582</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>bfa22f5a2364a2dacfc45cca1c8d3215</td>\n",
       "      <td>jeff koons épisodes 4</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id  season_id  facility_id  event_type_id  event_type_key_id  \\\n",
       "0    192         16            1              4                  4   \n",
       "1  30329       2767            1              5                  5   \n",
       "2    161         16            1              2                  2   \n",
       "3   5957        582            1              4                  4   \n",
       "4   8337        582            1              4                  4   \n",
       "\n",
       "   facility_key_id                        identifier  \\\n",
       "0                1  c1cecd093146068fd57896e254e98170   \n",
       "1                1  f510a6710878d7aca36e71c54abab525   \n",
       "2                1  21177fa9acad1ae2b1f595690fb853d3   \n",
       "3                1  962601f1eb153d45d49437f8fe839f7f   \n",
       "4                1  bfa22f5a2364a2dacfc45cca1c8d3215   \n",
       "\n",
       "                                               name  manual_added  is_display  \n",
       "0                                        frontières         False        True  \n",
       "1  visite guidée une autre histoire du monde (1h00)         False        True  \n",
       "2           visite contée les chercheurs d'or indiv         False        True  \n",
       "3     we dreamt of utopia and we woke up screaming.         False        True  \n",
       "4                             jeff koons épisodes 4         False        True  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the notebook's process_df helper on the loaded frame, then preview the result.\n",
    "df = df.pipe(process_df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "7fd9e5bd-baac-4b3b-9ffb-5a9baa18399b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                    int64\n",
       "season_id             int64\n",
       "facility_id           int64\n",
       "event_type_id         int64\n",
       "event_type_key_id     int64\n",
       "facility_key_id       int64\n",
       "identifier           object\n",
       "name                 object\n",
       "manual_added           bool\n",
       "is_display             bool\n",
       "dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24186efa-5908-4b03-bf52-96415fc8bd54",
   "metadata": {},
   "source": [
    "#### Deep analysis of event_types.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "90ab62d4-a086-4469-961c-67eefb375388",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1event_types.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "58db1751-fd56-4c28-b49e-bc8235bb0dc8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1event_types.csv\n",
      "Shape :  (9, 6)\n",
      "Number of columns :  6\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "      <th>fidelity_delay</th>\n",
       "      <th>identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>standard</td>\n",
       "      <td>2020-09-03 12:24:22.574262+02:00</td>\n",
       "      <td>2020-09-03 12:24:22.574262+02:00</td>\n",
       "      <td>36</td>\n",
       "      <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>66</td>\n",
       "      <td>package</td>\n",
       "      <td>2020-09-03 14:05:04.648137+02:00</td>\n",
       "      <td>2020-09-03 14:05:04.648137+02:00</td>\n",
       "      <td>36</td>\n",
       "      <td>efe90a8e604a7c840e88d03a67f6b7d8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>83</td>\n",
       "      <td>guide multimédias</td>\n",
       "      <td>2020-09-03 14:15:17.252539+02:00</td>\n",
       "      <td>2020-09-03 14:15:17.252539+02:00</td>\n",
       "      <td>36</td>\n",
       "      <td>ee14c62b3b9f6c7dd5401685a18e4460</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>non défini</td>\n",
       "      <td>2020-09-03 13:11:23.117024+02:00</td>\n",
       "      <td>2020-09-03 13:11:23.117024+02:00</td>\n",
       "      <td>36</td>\n",
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2723</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2021-12-22 09:45:47.715105+01:00</td>\n",
       "      <td>2021-12-22 09:45:47.715105+01:00</td>\n",
       "      <td>36</td>\n",
       "      <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     id               name                        created_at  \\\n",
       "0     1           standard  2020-09-03 12:24:22.574262+02:00   \n",
       "1    66            package  2020-09-03 14:05:04.648137+02:00   \n",
       "2    83  guide multimédias  2020-09-03 14:15:17.252539+02:00   \n",
       "3     3         non défini  2020-09-03 13:11:23.117024+02:00   \n",
       "4  2723                NaN  2021-12-22 09:45:47.715105+01:00   \n",
       "\n",
       "                         updated_at  fidelity_delay  \\\n",
       "0  2020-09-03 12:24:22.574262+02:00              36   \n",
       "1  2020-09-03 14:05:04.648137+02:00              36   \n",
       "2  2020-09-03 14:15:17.252539+02:00              36   \n",
       "3  2020-09-03 13:11:23.117024+02:00              36   \n",
       "4  2021-12-22 09:45:47.715105+01:00              36   \n",
       "\n",
       "                         identifier  \n",
       "0  c00f0c4675b91fb8b918e4079a0b1bac  \n",
       "1  efe90a8e604a7c840e88d03a67f6b7d8  \n",
       "2  ee14c62b3b9f6c7dd5401685a18e4460  \n",
       "3  52ff3466787b4d538407372e5f7afe0f  \n",
       "4  d41d8cd98f00b204e9800998ecf8427e  "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the file named in `name_dataset` through the notebook's display_databases helper,\n",
    "# print its column count and preview the first rows.\n",
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", df.shape[1])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "ac93382c-0b5f-462d-8021-0dd1e7201b8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns :  4\n",
      "Columns :  Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
      "Percent of NA for each column :  id                 0.000000\n",
      "fidelity_delay     0.000000\n",
      "identifier         0.000000\n",
      "name              11.111111\n",
      "dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>fidelity_delay</th>\n",
       "      <th>identifier</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>36</td>\n",
       "      <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
       "      <td>standard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>66</td>\n",
       "      <td>36</td>\n",
       "      <td>efe90a8e604a7c840e88d03a67f6b7d8</td>\n",
       "      <td>package</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>83</td>\n",
       "      <td>36</td>\n",
       "      <td>ee14c62b3b9f6c7dd5401685a18e4460</td>\n",
       "      <td>guide multimédias</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>36</td>\n",
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
       "      <td>non défini</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2723</td>\n",
       "      <td>36</td>\n",
       "      <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     id  fidelity_delay                        identifier               name\n",
       "0     1              36  c00f0c4675b91fb8b918e4079a0b1bac           standard\n",
       "1    66              36  efe90a8e604a7c840e88d03a67f6b7d8            package\n",
       "2    83              36  ee14c62b3b9f6c7dd5401685a18e4460  guide multimédias\n",
       "3     3              36  52ff3466787b4d538407372e5f7afe0f         non défini\n",
       "4  2723              36  d41d8cd98f00b204e9800998ecf8427e                NaN"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "18cbd630-3c7d-49e1-932b-9460badf3758",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                 int64\n",
       "fidelity_delay     int64\n",
       "identifier        object\n",
       "name              object\n",
       "dtype: object"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5847a441-31b9-4802-a5ae-90d8c6d6e153",
   "metadata": {},
   "source": [
    "#### Deep analysis of seasons.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "ae544dcc-f23d-4216-bb5b-597cc1b3765e",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1seasons.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "1ac97963-9208-4329-be41-d71a5797487f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1seasons.csv\n",
      "Shape :  (13, 6)\n",
      "Number of columns :  6\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "      <th>start_date_time</th>\n",
       "      <th>identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>943</td>\n",
       "      <td>2013</td>\n",
       "      <td>2021-07-29 08:55:33.282607+02:00</td>\n",
       "      <td>2021-07-29 08:55:33.282607+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8038da89e49ac5eabb489cfc6cea9fc1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>129</td>\n",
       "      <td>2014</td>\n",
       "      <td>2020-09-03 15:13:08.105567+02:00</td>\n",
       "      <td>2020-09-03 15:13:08.105567+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>cee8d6b7ce52554fd70354e37bbf44a2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>2015</td>\n",
       "      <td>2020-09-03 13:11:19.405037+02:00</td>\n",
       "      <td>2020-09-03 13:11:19.405037+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65d2ea03425887a717c435081cfc5dbb</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>2016</td>\n",
       "      <td>2020-09-03 13:11:19.401001+02:00</td>\n",
       "      <td>2020-09-03 13:11:19.401001+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>95192c98732387165bf8e396c0f2dad2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>2017</td>\n",
       "      <td>2020-09-03 13:11:19.409005+02:00</td>\n",
       "      <td>2020-09-03 13:11:19.409005+02:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8d8818c8e140c64c743113f563cf750f</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id  name                        created_at  \\\n",
       "0  943  2013  2021-07-29 08:55:33.282607+02:00   \n",
       "1  129  2014  2020-09-03 15:13:08.105567+02:00   \n",
       "2    3  2015  2020-09-03 13:11:19.405037+02:00   \n",
       "3    2  2016  2020-09-03 13:11:19.401001+02:00   \n",
       "4    4  2017  2020-09-03 13:11:19.409005+02:00   \n",
       "\n",
       "                         updated_at  start_date_time  \\\n",
       "0  2021-07-29 08:55:33.282607+02:00              NaN   \n",
       "1  2020-09-03 15:13:08.105567+02:00              NaN   \n",
       "2  2020-09-03 13:11:19.405037+02:00              NaN   \n",
       "3  2020-09-03 13:11:19.401001+02:00              NaN   \n",
       "4  2020-09-03 13:11:19.409005+02:00              NaN   \n",
       "\n",
       "                         identifier  \n",
       "0  8038da89e49ac5eabb489cfc6cea9fc1  \n",
       "1  cee8d6b7ce52554fd70354e37bbf44a2  \n",
       "2  65d2ea03425887a717c435081cfc5dbb  \n",
       "3  95192c98732387165bf8e396c0f2dad2  \n",
       "4  8d8818c8e140c64c743113f563cf750f  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "b4593d46-105c-47dd-aa71-babd8e63e65b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns :  4\n",
      "Columns :  Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
      "Percent of NA for each column :  id                   0.000000\n",
      "identifier           0.000000\n",
      "name                 7.692308\n",
      "start_date_time    100.000000\n",
      "dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>identifier</th>\n",
       "      <th>name</th>\n",
       "      <th>start_date_time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>943</td>\n",
       "      <td>8038da89e49ac5eabb489cfc6cea9fc1</td>\n",
       "      <td>2013</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>129</td>\n",
       "      <td>cee8d6b7ce52554fd70354e37bbf44a2</td>\n",
       "      <td>2014</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>65d2ea03425887a717c435081cfc5dbb</td>\n",
       "      <td>2015</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>95192c98732387165bf8e396c0f2dad2</td>\n",
       "      <td>2016</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>8d8818c8e140c64c743113f563cf750f</td>\n",
       "      <td>2017</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id                        identifier  name  start_date_time\n",
       "0  943  8038da89e49ac5eabb489cfc6cea9fc1  2013              NaN\n",
       "1  129  cee8d6b7ce52554fd70354e37bbf44a2  2014              NaN\n",
       "2    3  65d2ea03425887a717c435081cfc5dbb  2015              NaN\n",
       "3    2  95192c98732387165bf8e396c0f2dad2  2016              NaN\n",
       "4    4  8d8818c8e140c64c743113f563cf750f  2017              NaN"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "5d3b096d-8e73-4514-94e5-f2dcd4d0a89c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                   int64\n",
       "identifier          object\n",
       "name                object\n",
       "start_date_time    float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7b00bc7-eae6-457c-ac68-a4a55a6d1c8c",
   "metadata": {},
   "source": [
    "#### Deep Analysis of facilities.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "d95ef015-d44c-4353-8761-771b910d21c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "name_dataset = '1facilities.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "ef5fe794-8df7-4f27-8554-ecdc4074ac0b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  bdc2324-data/1/1facilities.csv\n",
      "Shape :  (2, 7)\n",
      "Number of columns :  7\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "      <th>street_id</th>\n",
       "      <th>fixed_capacity</th>\n",
       "      <th>identifier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>non défini</td>\n",
       "      <td>2020-09-03 13:16:35.293111+02:00</td>\n",
       "      <td>2020-09-03 13:16:35.293111+02:00</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>mucem</td>\n",
       "      <td>2020-09-03 13:11:23.133059+02:00</td>\n",
       "      <td>2020-09-03 13:11:23.133059+02:00</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>702bd76fe3dd5dbcf118a6965a946f54</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id        name                        created_at  \\\n",
       "0   2  non défini  2020-09-03 13:16:35.293111+02:00   \n",
       "1   1       mucem  2020-09-03 13:11:23.133059+02:00   \n",
       "\n",
       "                         updated_at  street_id  fixed_capacity  \\\n",
       "0  2020-09-03 13:16:35.293111+02:00          2             NaN   \n",
       "1  2020-09-03 13:11:23.133059+02:00          1             NaN   \n",
       "\n",
       "                         identifier  \n",
       "0  52ff3466787b4d538407372e5f7afe0f  \n",
       "1  702bd76fe3dd5dbcf118a6965a946f54  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = display_databases(name_dataset)\n",
    "print(\"Number of columns : \", len(df.columns))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "e3621201-fab9-49fd-95c1-0b9d5da76e50",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of columns :  5\n",
      "Columns :  Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n",
      "Percent of NA for each column :  id                  0.0\n",
      "street_id           0.0\n",
      "identifier          0.0\n",
      "name                0.0\n",
      "fixed_capacity    100.0\n",
      "dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>street_id</th>\n",
       "      <th>identifier</th>\n",
       "      <th>name</th>\n",
       "      <th>fixed_capacity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
       "      <td>non défini</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>702bd76fe3dd5dbcf118a6965a946f54</td>\n",
       "      <td>mucem</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  street_id                        identifier        name  fixed_capacity\n",
       "0   2          2  52ff3466787b4d538407372e5f7afe0f  non défini             NaN\n",
       "1   1          1  702bd76fe3dd5dbcf118a6965a946f54       mucem             NaN"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = process_df(df)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "1b198b92-8654-4531-a0dd-8f2e01c2e6c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "id                  int64\n",
       "street_id           int64\n",
       "identifier         object\n",
       "name               object\n",
       "fixed_capacity    float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}