2723 lines
		
	
	
		
			81 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			2723 lines
		
	
	
		
			81 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
{
 | 
						|
 "cells": [
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "455cc769-1b3b-4fef-b395-e74a988ceed3",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "## Notebook Alexis"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 5,
 | 
						|
   "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "import pandas as pd\n",
 | 
						|
    "import os\n",
 | 
						|
    "import s3fs"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 6,
 | 
						|
   "id": "30494c5e-9649-4fff-8708-617544188b20",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "['bdc2324-data/1',\n",
 | 
						|
       " 'bdc2324-data/10',\n",
 | 
						|
       " 'bdc2324-data/101',\n",
 | 
						|
       " 'bdc2324-data/11',\n",
 | 
						|
       " 'bdc2324-data/12',\n",
 | 
						|
       " 'bdc2324-data/13',\n",
 | 
						|
       " 'bdc2324-data/14',\n",
 | 
						|
       " 'bdc2324-data/2',\n",
 | 
						|
       " 'bdc2324-data/3',\n",
 | 
						|
       " 'bdc2324-data/4',\n",
 | 
						|
       " 'bdc2324-data/5',\n",
 | 
						|
       " 'bdc2324-data/6',\n",
 | 
						|
       " 'bdc2324-data/7',\n",
 | 
						|
       " 'bdc2324-data/8',\n",
 | 
						|
       " 'bdc2324-data/9']"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 6,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "# Create filesystem object\n",
 | 
						|
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
 | 
						|
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
 | 
						|
    "\n",
 | 
						|
    "BUCKET = \"bdc2324-data\"\n",
 | 
						|
    "fs.ls(BUCKET)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "2feffee9-9f23-4caa-8a01-9e4a93abbf5d",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "###  I. Analyse fichier 8"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "f54ba449-2051-4acd-939d-d30abd5452fe",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "This section describes the databases associated with company 8. "
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 7,
 | 
						|
   "id": "f1cce705-46e1-42de-8e93-2ee15312d288",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "directory_path = '8'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 8,
 | 
						|
   "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "bdc2324-data/8/8campaign_stats.csv\n",
 | 
						|
      "bdc2324-data/8/8campaigns.csv\n",
 | 
						|
      "bdc2324-data/8/8categories.csv\n",
 | 
						|
      "bdc2324-data/8/8countries.csv\n",
 | 
						|
      "bdc2324-data/8/8currencies.csv\n",
 | 
						|
      "bdc2324-data/8/8customer_target_mappings.csv\n",
 | 
						|
      "bdc2324-data/8/8customersplus.csv\n",
 | 
						|
      "bdc2324-data/8/8event_types.csv\n",
 | 
						|
      "bdc2324-data/8/8events.csv\n",
 | 
						|
      "bdc2324-data/8/8facilities.csv\n",
 | 
						|
      "bdc2324-data/8/8link_stats.csv\n",
 | 
						|
      "bdc2324-data/8/8pricing_formulas.csv\n",
 | 
						|
      "bdc2324-data/8/8product_packs.csv\n",
 | 
						|
      "bdc2324-data/8/8products.csv\n",
 | 
						|
      "bdc2324-data/8/8products_groups.csv\n",
 | 
						|
      "bdc2324-data/8/8purchases.csv\n",
 | 
						|
      "bdc2324-data/8/8representation_category_capacities.csv\n",
 | 
						|
      "bdc2324-data/8/8representations.csv\n",
 | 
						|
      "bdc2324-data/8/8seasons.csv\n",
 | 
						|
      "bdc2324-data/8/8suppliers.csv\n",
 | 
						|
      "bdc2324-data/8/8target_types.csv\n",
 | 
						|
      "bdc2324-data/8/8targets.csv\n",
 | 
						|
      "bdc2324-data/8/8tickets.csv\n",
 | 
						|
      "bdc2324-data/8/8type_of_categories.csv\n",
 | 
						|
      "bdc2324-data/8/8type_of_pricing_formulas.csv\n",
 | 
						|
      "bdc2324-data/8/8type_ofs.csv\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "# check the files in the directory\n",
 | 
						|
    "\n",
 | 
						|
    "objects = fs.ls(f'{BUCKET}/{directory_path}')\n",
 | 
						|
    "\n",
 | 
						|
    "for file in objects:\n",
 | 
						|
    "    print(file)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 9,
 | 
						|
   "id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "def display_databases(file_name, directory=None):\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    Load one CSV file of a company folder from the S3 bucket.\n",
 | 
						|
    "\n",
 | 
						|
    "    file_name : name of the file inside the company folder\n",
 | 
						|
    "    directory : company folder inside BUCKET; when None, the notebook-level\n",
 | 
						|
    "                directory_path is used (read at call time)\n",
 | 
						|
    "\n",
 | 
						|
    "    Returns the content as a pandas DataFrame; the path and the shape\n",
 | 
						|
    "    are printed along the way.\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    if directory is None:\n",
 | 
						|
    "        # Existing one-argument calls keep following the global directory_path\n",
 | 
						|
    "        directory = directory_path\n",
 | 
						|
    "    file_path = f\"{BUCKET}/{directory}/{file_name}\"\n",
 | 
						|
    "    print(\"File path : \", file_path)\n",
 | 
						|
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
 | 
						|
    "        df = pd.read_csv(file_in, sep=\",\")\n",
 | 
						|
    "\n",
 | 
						|
    "    print(\"Shape : \", df.shape)\n",
 | 
						|
    "    return df"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "ddd545ef-7e9f-4696-962a-115294991641",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Look at campaigns files"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "0214d30d-5f83-498f-867f-e67b5793b731",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "campaigns = display_databases(\"8campaigns.csv\")\n",
 | 
						|
    "campaigns.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "e7982be4-2c42-4a91-be5a-329a999644cc",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "campaign_stats = display_databases(\"8campaign_stats.csv\")\n",
 | 
						|
    "campaign_stats.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "e6512bc9-91f5-4fe4-a637-a4e84dc497a9",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Look at links files"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "28e7c1fe-470f-4d84-87b8-a711a973500b",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "There is no links file for this company, only the link_stats file."
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "e973575b-4ed6-4b23-8024-f383ac82e87c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "links_stats = display_databases(\"8link_stats.csv\")\n",
 | 
						|
    "links_stats.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "8dfcca1f-1323-413f-aa8d-3ee5ce2610a8",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyse Customersplus file"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "3b523575-c779-451c-a12e-a36fb4ad232c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "file_name = \"8customersplus.csv\"\n",
 | 
						|
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
 | 
						|
    "print(file_path)\n",
 | 
						|
    "with fs.open(file_path, mode=\"rb\") as file_in:\n",
 | 
						|
    "    customersplus = pd.read_csv(file_in, sep=\",\")\n",
 | 
						|
    "\n",
 | 
						|
    "customersplus.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "fe56785a-ed3c-4322-aafa-a630f97b836f",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyse Structures files"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "file_name = \"8structures.csv\"\n",
 | 
						|
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
 | 
						|
    "print(file_path)\n",
 | 
						|
    "try:\n",
 | 
						|
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
 | 
						|
    "        structures = pd.read_csv(file_in, sep=\",\")\n",
 | 
						|
    "except FileNotFoundError:\n",
 | 
						|
    "    # Only a missing file means \"no database\": credential, network or parsing\n",
 | 
						|
    "    # errors must surface instead of being reported as an absent file.\n",
 | 
						|
    "    structures = None\n",
 | 
						|
    "    print(\"No structures database\")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "b8452558-2d32-459b-91e7-f6042345e465",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "For Stade Français, there are no structures, tags and structure_tag_mapping databases"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "285b1422-9ca9-4afd-b752-777a54aaa677",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze Target databases"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "file_name = \"8customer_target_mappings.csv\"\n",
 | 
						|
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
 | 
						|
    "print(file_path)\n",
 | 
						|
    "try:\n",
 | 
						|
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
 | 
						|
    "        customer_targets = pd.read_csv(file_in, sep=\",\")\n",
 | 
						|
    "except FileNotFoundError:\n",
 | 
						|
    "    # Only a missing file means \"no database\"; any other error must surface.\n",
 | 
						|
    "    # The empty frame keeps the lines below from using an unbound or stale variable.\n",
 | 
						|
    "    customer_targets = pd.DataFrame()\n",
 | 
						|
    "    print(\"No such database in s3\")\n",
 | 
						|
    "\n",
 | 
						|
    "print(\"Shape : \", customer_targets.shape)\n",
 | 
						|
    "customer_targets.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "file_name = \"8targets.csv\"\n",
 | 
						|
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
 | 
						|
    "print(file_path)\n",
 | 
						|
    "try:\n",
 | 
						|
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
 | 
						|
    "        targets = pd.read_csv(file_in, sep=\",\")\n",
 | 
						|
    "except FileNotFoundError:\n",
 | 
						|
    "    # Only a missing file means \"no database\"; any other error must surface.\n",
 | 
						|
    "    # The empty frame keeps the lines below from using an unbound or stale variable.\n",
 | 
						|
    "    targets = pd.DataFrame()\n",
 | 
						|
    "    print(\"No such database in s3\")\n",
 | 
						|
    "\n",
 | 
						|
    "print(\"Shape : \", targets.shape)\n",
 | 
						|
    "targets.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "85696d74-3b2f-4368-9045-44db5322b60d",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "file_name = \"8target_types.csv\"\n",
 | 
						|
    "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
 | 
						|
    "print(file_path)\n",
 | 
						|
    "try:\n",
 | 
						|
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
 | 
						|
    "        target_types = pd.read_csv(file_in, sep=\",\")\n",
 | 
						|
    "except FileNotFoundError:\n",
 | 
						|
    "    # Only a missing file means \"no database\"; any other error must surface.\n",
 | 
						|
    "    # The empty frame keeps the lines below from using an unbound or stale variable.\n",
 | 
						|
    "    target_types = pd.DataFrame()\n",
 | 
						|
    "    print(\"No such database in s3\")\n",
 | 
						|
    "\n",
 | 
						|
    "print(\"Shape : \", target_types.shape)\n",
 | 
						|
    "target_types.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "cdc6416b-3deb-446c-8957-435745b93533",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze consumption files"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "f8622bd5-a5ab-403f-ab01-758aec879ee4",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv\n",
 | 
						|
    "\n",
 | 
						|
    "However, there is no consumptions.csv file"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "purchases = display_databases(\"8purchases.csv\")\n",
 | 
						|
    "purchases.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "903321fb-99f8-475d-b4a6-c70ec2efe190",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "tickets = display_databases(\"8tickets.csv\")\n",
 | 
						|
    "tickets.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "243e6942-0233-4cd5-b32b-e005457131d2",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "suppliers = display_databases(\"8suppliers.csv\")\n",
 | 
						|
    "suppliers.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "fd8c876a-f0c5-4123-a422-c267af5f29b1",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyse product file"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "6b82efce-1dee-4d89-8585-28c4ad477eef",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "products = display_databases(\"8products.csv\")\n",
 | 
						|
    "products.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "8ad143b2-2869-4bd2-982e-688498b98727",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze pricing files"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "9a54e9a5-801d-4000-9e76-e792edbf7e41",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Meaning pricing_formulas.csv and type_of_pricing_formulas"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "daf37bff-a26d-4ff5-ad50-c90f917164bd",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n",
 | 
						|
    "pricing_formulas.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "cdb14488-b093-4b39-84fa-1c2b4576208f",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n",
 | 
						|
    "type_pricing_formulas.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "a084297a-4fd7-4cda-b513-7704f4244a5c",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze type of products"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "76a67ea7-8720-441e-8973-23e5d105370e",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Meaning categories.csv, type_of_categories.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "6582694d-5339-4f33-a943-c73033121a90",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "categories = display_databases(\"8categories.csv\")\n",
 | 
						|
    "categories.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "589076df-1958-42de-9941-1aff9fa8536f",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "type_categories = display_databases(\"8type_of_categories.csv\")\n",
 | 
						|
    "type_categories.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "3427b681-4c05-4e4e-9c2b-867ee789f98c",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze type of representations"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "9381e36b-090a-44c5-a29d-3ac4c9a4431e",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Meaning representation_category_capacities.csv, representations.csv, representation_types.csv\n",
 | 
						|
    "\n",
 | 
						|
    "however there is no representation_types database"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n",
 | 
						|
    "representation_category_capacities.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "bd405913-033d-4f15-a5b9-103d577baaff",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "representations = display_databases(\"8representations.csv\")\n",
 | 
						|
    "representations.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "#representation_type = display_databases(\"8representation_types.csv\")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "a9b02406-2a69-4431-8d49-3c6bd6a5e1c7",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze type of events"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "1d554266-282c-4f64-9a0f-ddcf591ec912",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Meaning events.csv, event_types.csv, seasons.csv and facilities.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "events = display_databases(\"8events.csv\")\n",
 | 
						|
    "events.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "event_types = display_databases(\"8event_types.csv\")\n",
 | 
						|
    "event_types.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "cba0ee58-6280-45fe-99b3-0be09db5922b",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "seasons = display_databases(\"8seasons.csv\")\n",
 | 
						|
    "seasons.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "facilities = display_databases(\"8facilities.csv\")\n",
 | 
						|
    "facilities.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "c7467d41-0ded-465d-bb08-15be914a166b",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Analyze annexe databases"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "17e9e334-0ae4-48d8-bed5-b50b4af49d5b",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "d3ec1040-48b2-40bb-8947-920ddb4589f3",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "## II. Identify Common Datasets"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "ec528a8a-df38-48e2-a1be-4a1459a80a1e",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "From the analysis of company 8, we notice that some databases do not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the databases that are common to all companies."
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 18,
 | 
						|
   "id": "c240b811-48a6-4501-9e70-bc51d69e3ac4",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "## We first construct a dictionary reporting all the datasets of each company\n",
 | 
						|
    "\n",
 | 
						|
    "companies = fs.ls(BUCKET)\n",
 | 
						|
    "companies_database = {}\n",
 | 
						|
    "\n",
 | 
						|
    "for company in companies:\n",
 | 
						|
    "    company_id = company.split('/')[-1]\n",
 | 
						|
    "    file_names = [path.split('/')[-1] for path in fs.ls(company)]\n",
 | 
						|
    "    # Strip the company id only where it prefixes the file name: str.replace\n",
 | 
						|
    "    # would also delete the same digits anywhere else in the name.\n",
 | 
						|
    "    companies_database[company_id] = [\n",
 | 
						|
    "        name[len(company_id):] if name.startswith(company_id) else name\n",
 | 
						|
    "        for name in file_names\n",
 | 
						|
    "    ]"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 24,
 | 
						|
   "id": "54057367-9df9-42f4-aa07-bf524bb76462",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of databases :  30\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "# Then we create a list of all databases\n",
 | 
						|
    "\n",
 | 
						|
    "# Union over every company: the company with the most files is not guaranteed\n",
 | 
						|
    "# to own every database that exists for another company.\n",
 | 
						|
    "all_database = sorted(set().union(*companies_database.values()))\n",
 | 
						|
    "print(\"Number of databases : \",len(all_database))"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 39,
 | 
						|
   "id": "63914e20-9efc-4088-877b-edab5f225d00",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "30\n",
 | 
						|
      "23\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "## We then create the set of databases that all companies have in common\n",
 | 
						|
    "\n",
 | 
						|
    "data_in_common = set(all_database)\n",
 | 
						|
    "\n",
 | 
						|
    "print(len(data_in_common))\n",
 | 
						|
    "\n",
 | 
						|
    "# Keep only the databases owned by every company (plain set intersection)\n",
 | 
						|
    "for company_files in companies_database.values():\n",
 | 
						|
    "    data_in_common = data_in_common.intersection(company_files)\n",
 | 
						|
    "\n",
 | 
						|
    "print(len(data_in_common))"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "676d8536-7d8c-4075-a357-b8d06e501ca8",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "## Create Universal database"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "7e460fbe-5067-4998-a1a8-9e3d07401750",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n",
 | 
						|
    "\n",
 | 
						|
    "Let's first create our procedure for company 1 and the datasets belonging to the products theme"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 10,
 | 
						|
   "id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "directory_path = '1'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 11,
 | 
						|
   "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 12,
 | 
						|
   "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "def remove_horodates(df):\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    Return a copy of df without the timestamp columns created_at and updated_at.\n",
 | 
						|
    "\n",
 | 
						|
    "    A column that is absent is simply skipped, so the function can be applied\n",
 | 
						|
    "    to any dataset. The input DataFrame is left untouched.\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    # errors=\"ignore\": do not raise KeyError when a dataset has no timestamp column\n",
 | 
						|
    "    return df.drop(columns=[\"created_at\", \"updated_at\"], errors=\"ignore\")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 13,
 | 
						|
   "id": "2c478213-09ae-44ef-8c7c-125bcb571642",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "def order_columns_id(df):\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    Move the id columns to the front so that the dataset is easier to read.\n",
 | 
						|
    "\n",
 | 
						|
    "    Every column whose name contains the substring 'id' counts as an id column\n",
 | 
						|
    "    (so 'identifier' qualifies too). The relative order inside each group is kept.\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    # sorted() is stable: False (name contains 'id') sorts before True\n",
 | 
						|
    "    ordered = sorted(df.columns, key=lambda name: 'id' not in name)\n",
 | 
						|
    "    return df[ordered]"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 14,
 | 
						|
   "id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "def percent_na(df):\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    Compute, for every column of df, the share of missing values in percent.\n",
 | 
						|
    "\n",
 | 
						|
    "    Returns a Series indexed by column name.\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    n_rows = len(df)\n",
 | 
						|
    "    na_counts = df.isna().sum()\n",
 | 
						|
    "    return na_counts * 100 / n_rows"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 25,
 | 
						|
   "id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "def process_df(df):\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    Clean a dataset: drop the timestamp columns, then put the id columns first.\n",
 | 
						|
    "\n",
 | 
						|
    "    The number of columns, the column names and the percentage of missing\n",
 | 
						|
    "    values per column are printed along the way. Returns the cleaned DataFrame.\n",
 | 
						|
    "    \"\"\"\n",
 | 
						|
    "    without_timestamps = remove_horodates(df)\n",
 | 
						|
    "    print(\"Number of columns : \", len(without_timestamps.columns))\n",
 | 
						|
    "    reordered = order_columns_id(without_timestamps)\n",
 | 
						|
    "    print(\"Columns : \", reordered.columns)\n",
 | 
						|
    "    print(\"Percent of NA for each column : \", percent_na(reordered))\n",
 | 
						|
    "    return reordered"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of products.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 15,
 | 
						|
   "id": "862a7658-0602-4d94-bb58-d23774c00d32",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "File path :  bdc2324-data/1/1products.csv\n",
 | 
						|
      "Shape :  (94803, 14)\n",
 | 
						|
      "Number of columns :  14\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>amount</th>\n",
 | 
						|
       "      <th>is_full_price</th>\n",
 | 
						|
       "      <th>representation_id</th>\n",
 | 
						|
       "      <th>pricing_formula_id</th>\n",
 | 
						|
       "      <th>created_at</th>\n",
 | 
						|
       "      <th>updated_at</th>\n",
 | 
						|
       "      <th>category_id</th>\n",
 | 
						|
       "      <th>apply_price</th>\n",
 | 
						|
       "      <th>products_group_id</th>\n",
 | 
						|
       "      <th>product_pack_id</th>\n",
 | 
						|
       "      <th>extra_field</th>\n",
 | 
						|
       "      <th>amount_consumption</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>10682</td>\n",
 | 
						|
       "      <td>9.0</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>914</td>\n",
 | 
						|
       "      <td>114</td>\n",
 | 
						|
       "      <td>2020-09-03 14:09:43.119798+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 14:09:43.119798+02:00</td>\n",
 | 
						|
       "      <td>41</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>10655</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>35c88f2db8a63d7474e46eb8ca9260e7</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>478</td>\n",
 | 
						|
       "      <td>9.5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>273</td>\n",
 | 
						|
       "      <td>131</td>\n",
 | 
						|
       "      <td>2020-09-03 13:21:22.711773+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:21:22.711773+02:00</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>471</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>8a179671ab198e570e6a104c4451379f</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>20873</td>\n",
 | 
						|
       "      <td>11.5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>275</td>\n",
 | 
						|
       "      <td>137</td>\n",
 | 
						|
       "      <td>2020-09-03 14:46:33.589030+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 14:46:33.589030+02:00</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>20825</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>ee83779ce29e67ad251e40234b426d6a</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>157142</td>\n",
 | 
						|
       "      <td>8.0</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>82519</td>\n",
 | 
						|
       "      <td>9</td>\n",
 | 
						|
       "      <td>2022-01-28 19:29:23.525722+01:00</td>\n",
 | 
						|
       "      <td>2022-01-28 19:29:23.525722+01:00</td>\n",
 | 
						|
       "      <td>5</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>156773</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>d865383579314b791aa4bcf3fb418f17</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>1341</td>\n",
 | 
						|
       "      <td>8.5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>9</td>\n",
 | 
						|
       "      <td>93</td>\n",
 | 
						|
       "      <td>2020-09-03 13:29:30.773089+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:29:30.773089+02:00</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>1175</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>f1c4689bc47dee6f60b56d74b593dd46</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "       id  amount  is_full_price  representation_id  pricing_formula_id  \\\n",
 | 
						|
       "0   10682     9.0          False                914                 114   \n",
 | 
						|
       "1     478     9.5          False                273                 131   \n",
 | 
						|
       "2   20873    11.5          False                275                 137   \n",
 | 
						|
       "3  157142     8.0          False              82519                   9   \n",
 | 
						|
       "4    1341     8.5          False                  9                  93   \n",
 | 
						|
       "\n",
 | 
						|
       "                         created_at                        updated_at  \\\n",
 | 
						|
       "0  2020-09-03 14:09:43.119798+02:00  2020-09-03 14:09:43.119798+02:00   \n",
 | 
						|
       "1  2020-09-03 13:21:22.711773+02:00  2020-09-03 13:21:22.711773+02:00   \n",
 | 
						|
       "2  2020-09-03 14:46:33.589030+02:00  2020-09-03 14:46:33.589030+02:00   \n",
 | 
						|
       "3  2022-01-28 19:29:23.525722+01:00  2022-01-28 19:29:23.525722+01:00   \n",
 | 
						|
       "4  2020-09-03 13:29:30.773089+02:00  2020-09-03 13:29:30.773089+02:00   \n",
 | 
						|
       "\n",
 | 
						|
       "   category_id  apply_price  products_group_id  product_pack_id  extra_field  \\\n",
 | 
						|
       "0           41          0.0              10655                1          NaN   \n",
 | 
						|
       "1            1          0.0                471                1          NaN   \n",
 | 
						|
       "2            1          0.0              20825                1          NaN   \n",
 | 
						|
       "3            5          0.0             156773                1          NaN   \n",
 | 
						|
       "4            1          0.0               1175                1          NaN   \n",
 | 
						|
       "\n",
 | 
						|
       "   amount_consumption                        identifier  \n",
 | 
						|
       "0                 NaN  35c88f2db8a63d7474e46eb8ca9260e7  \n",
 | 
						|
       "1                 NaN  8a179671ab198e570e6a104c4451379f  \n",
 | 
						|
       "2                 NaN  ee83779ce29e67ad251e40234b426d6a  \n",
 | 
						|
       "3                 NaN  d865383579314b791aa4bcf3fb418f17  \n",
 | 
						|
       "4                 NaN  f1c4689bc47dee6f60b56d74b593dd46  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 15,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "products = display_databases(\"1products.csv\")\n",
 | 
						|
    "print(\"Number of columns : \", len(products.columns))\n",
 | 
						|
    "products.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 16,
 | 
						|
   "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of columns :  12\n",
 | 
						|
      "Columns :  Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
 | 
						|
      "       'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
 | 
						|
      "       'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
 | 
						|
      "      dtype='object')\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>representation_id</th>\n",
 | 
						|
       "      <th>pricing_formula_id</th>\n",
 | 
						|
       "      <th>category_id</th>\n",
 | 
						|
       "      <th>products_group_id</th>\n",
 | 
						|
       "      <th>product_pack_id</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "      <th>amount</th>\n",
 | 
						|
       "      <th>is_full_price</th>\n",
 | 
						|
       "      <th>apply_price</th>\n",
 | 
						|
       "      <th>extra_field</th>\n",
 | 
						|
       "      <th>amount_consumption</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>10682</td>\n",
 | 
						|
       "      <td>914</td>\n",
 | 
						|
       "      <td>114</td>\n",
 | 
						|
       "      <td>41</td>\n",
 | 
						|
       "      <td>10655</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>35c88f2db8a63d7474e46eb8ca9260e7</td>\n",
 | 
						|
       "      <td>9.0</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>478</td>\n",
 | 
						|
       "      <td>273</td>\n",
 | 
						|
       "      <td>131</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>471</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>8a179671ab198e570e6a104c4451379f</td>\n",
 | 
						|
       "      <td>9.5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>20873</td>\n",
 | 
						|
       "      <td>275</td>\n",
 | 
						|
       "      <td>137</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>20825</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>ee83779ce29e67ad251e40234b426d6a</td>\n",
 | 
						|
       "      <td>11.5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>157142</td>\n",
 | 
						|
       "      <td>82519</td>\n",
 | 
						|
       "      <td>9</td>\n",
 | 
						|
       "      <td>5</td>\n",
 | 
						|
       "      <td>156773</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>d865383579314b791aa4bcf3fb418f17</td>\n",
 | 
						|
       "      <td>8.0</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>1341</td>\n",
 | 
						|
       "      <td>9</td>\n",
 | 
						|
       "      <td>93</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>1175</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>f1c4689bc47dee6f60b56d74b593dd46</td>\n",
 | 
						|
       "      <td>8.5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>0.0</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "       id  representation_id  pricing_formula_id  category_id  \\\n",
 | 
						|
       "0   10682                914                 114           41   \n",
 | 
						|
       "1     478                273                 131            1   \n",
 | 
						|
       "2   20873                275                 137            1   \n",
 | 
						|
       "3  157142              82519                   9            5   \n",
 | 
						|
       "4    1341                  9                  93            1   \n",
 | 
						|
       "\n",
 | 
						|
       "   products_group_id  product_pack_id                        identifier  \\\n",
 | 
						|
       "0              10655                1  35c88f2db8a63d7474e46eb8ca9260e7   \n",
 | 
						|
       "1                471                1  8a179671ab198e570e6a104c4451379f   \n",
 | 
						|
       "2              20825                1  ee83779ce29e67ad251e40234b426d6a   \n",
 | 
						|
       "3             156773                1  d865383579314b791aa4bcf3fb418f17   \n",
 | 
						|
       "4               1175                1  f1c4689bc47dee6f60b56d74b593dd46   \n",
 | 
						|
       "\n",
 | 
						|
       "   amount  is_full_price  apply_price  extra_field  amount_consumption  \n",
 | 
						|
       "0     9.0          False          0.0          NaN                 NaN  \n",
 | 
						|
       "1     9.5          False          0.0          NaN                 NaN  \n",
 | 
						|
       "2    11.5          False          0.0          NaN                 NaN  \n",
 | 
						|
       "3     8.0          False          0.0          NaN                 NaN  \n",
 | 
						|
       "4     8.5          False          0.0          NaN                 NaN  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 16,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "products = remove_horodates(products)\n",
 | 
						|
    "print(\"Number of columns : \", len(products.columns))\n",
 | 
						|
    "products = order_columns_id(products)\n",
 | 
						|
    "print(\"Columns : \", products.columns)\n",
 | 
						|
    "products.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 17,
 | 
						|
   "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "id                      int64\n",
 | 
						|
      "representation_id       int64\n",
 | 
						|
      "pricing_formula_id      int64\n",
 | 
						|
      "category_id             int64\n",
 | 
						|
      "products_group_id       int64\n",
 | 
						|
      "product_pack_id         int64\n",
 | 
						|
      "identifier             object\n",
 | 
						|
      "amount                float64\n",
 | 
						|
      "is_full_price            bool\n",
 | 
						|
      "apply_price           float64\n",
 | 
						|
      "extra_field           float64\n",
 | 
						|
      "amount_consumption    float64\n",
 | 
						|
      "dtype: object\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "print(products.dtypes)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 18,
 | 
						|
   "id": "460749ac-aa26-4216-8667-518546f72f72",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "id                      0.0\n",
 | 
						|
      "representation_id       0.0\n",
 | 
						|
      "pricing_formula_id      0.0\n",
 | 
						|
      "category_id             0.0\n",
 | 
						|
      "products_group_id       0.0\n",
 | 
						|
      "product_pack_id         0.0\n",
 | 
						|
      "identifier              0.0\n",
 | 
						|
      "amount                  0.0\n",
 | 
						|
      "is_full_price           0.0\n",
 | 
						|
      "apply_price             0.0\n",
 | 
						|
      "extra_field           100.0\n",
 | 
						|
      "amount_consumption    100.0\n",
 | 
						|
      "dtype: float64\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "percent_missing = products.isna().sum() * 100 / len(products)\n",
 | 
						|
    "print(percent_missing)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of categories.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 26,
 | 
						|
   "id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "name_dataset = '1categories.csv'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 27,
 | 
						|
   "id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "File path :  bdc2324-data/1/1categories.csv\n",
 | 
						|
      "Shape :  (27, 7)\n",
 | 
						|
      "Number of columns :  7\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>created_at</th>\n",
 | 
						|
       "      <th>updated_at</th>\n",
 | 
						|
       "      <th>extra_field</th>\n",
 | 
						|
       "      <th>quota</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>30</td>\n",
 | 
						|
       "      <td>en nb entrées gr</td>\n",
 | 
						|
       "      <td>2020-09-03 13:21:20.019202+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:21:20.019202+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>16</td>\n",
 | 
						|
       "      <td>indiv activité enfant</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.306968+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.306968+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>39</td>\n",
 | 
						|
       "      <td>indiv activité gr</td>\n",
 | 
						|
       "      <td>2020-09-03 13:21:20.029901+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:21:20.029901+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>1108</td>\n",
 | 
						|
       "      <td>groupe forfait adulte</td>\n",
 | 
						|
       "      <td>2020-09-19 02:06:43.145697+02:00</td>\n",
 | 
						|
       "      <td>2020-09-19 02:06:43.145697+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>3edda20c877a93b5ff883827238eb711</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>6</td>\n",
 | 
						|
       "      <td>groupe forfait entrées tr</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.264997+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.264997+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "     id                       name                        created_at  \\\n",
 | 
						|
       "0    30           en nb entrées gr  2020-09-03 13:21:20.019202+02:00   \n",
 | 
						|
       "1    16      indiv activité enfant  2020-09-03 13:11:23.306968+02:00   \n",
 | 
						|
       "2    39          indiv activité gr  2020-09-03 13:21:20.029901+02:00   \n",
 | 
						|
       "3  1108      groupe forfait adulte  2020-09-19 02:06:43.145697+02:00   \n",
 | 
						|
       "4     6  groupe forfait entrées tr  2020-09-03 13:11:23.264997+02:00   \n",
 | 
						|
       "\n",
 | 
						|
       "                         updated_at  extra_field  quota  \\\n",
 | 
						|
       "0  2020-09-03 13:21:20.019202+02:00          NaN    NaN   \n",
 | 
						|
       "1  2020-09-03 13:11:23.306968+02:00          NaN    NaN   \n",
 | 
						|
       "2  2020-09-03 13:21:20.029901+02:00          NaN    NaN   \n",
 | 
						|
       "3  2020-09-19 02:06:43.145697+02:00          NaN    NaN   \n",
 | 
						|
       "4  2020-09-03 13:11:23.264997+02:00          NaN    NaN   \n",
 | 
						|
       "\n",
 | 
						|
       "                         identifier  \n",
 | 
						|
       "0  849ab2791a14f5fc2bb4d87ab2b78bf6  \n",
 | 
						|
       "1  425fd2f01984cc4ba030c1be98f42c33  \n",
 | 
						|
       "2  9244dd3738788db0d22a5d0afe687b69  \n",
 | 
						|
       "3  3edda20c877a93b5ff883827238eb711  \n",
 | 
						|
       "4  ff48df4b2dd5a14116bf4d280b31621e  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 27,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = display_databases(name_dataset)\n",
 | 
						|
    "print(\"Number of columns : \", len(df.columns))\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 28,
 | 
						|
   "id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of columns :  5\n",
 | 
						|
      "Columns :  Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
 | 
						|
      "Percent of NA for each column :  id               0.000000\n",
 | 
						|
      "identifier       0.000000\n",
 | 
						|
      "name             3.703704\n",
 | 
						|
      "extra_field    100.000000\n",
 | 
						|
      "quota          100.000000\n",
 | 
						|
      "dtype: float64\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>extra_field</th>\n",
 | 
						|
       "      <th>quota</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>30</td>\n",
 | 
						|
       "      <td>849ab2791a14f5fc2bb4d87ab2b78bf6</td>\n",
 | 
						|
       "      <td>en nb entrées gr</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>16</td>\n",
 | 
						|
       "      <td>425fd2f01984cc4ba030c1be98f42c33</td>\n",
 | 
						|
       "      <td>indiv activité enfant</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>39</td>\n",
 | 
						|
       "      <td>9244dd3738788db0d22a5d0afe687b69</td>\n",
 | 
						|
       "      <td>indiv activité gr</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>1108</td>\n",
 | 
						|
       "      <td>3edda20c877a93b5ff883827238eb711</td>\n",
 | 
						|
       "      <td>groupe forfait adulte</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>6</td>\n",
 | 
						|
       "      <td>ff48df4b2dd5a14116bf4d280b31621e</td>\n",
 | 
						|
       "      <td>groupe forfait entrées tr</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "     id                        identifier                       name  \\\n",
 | 
						|
       "0    30  849ab2791a14f5fc2bb4d87ab2b78bf6           en nb entrées gr   \n",
 | 
						|
       "1    16  425fd2f01984cc4ba030c1be98f42c33      indiv activité enfant   \n",
 | 
						|
       "2    39  9244dd3738788db0d22a5d0afe687b69          indiv activité gr   \n",
 | 
						|
       "3  1108  3edda20c877a93b5ff883827238eb711      groupe forfait adulte   \n",
 | 
						|
       "4     6  ff48df4b2dd5a14116bf4d280b31621e  groupe forfait entrées tr   \n",
 | 
						|
       "\n",
 | 
						|
       "   extra_field  quota  \n",
 | 
						|
       "0          NaN    NaN  \n",
 | 
						|
       "1          NaN    NaN  \n",
 | 
						|
       "2          NaN    NaN  \n",
 | 
						|
       "3          NaN    NaN  \n",
 | 
						|
       "4          NaN    NaN  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 28,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = process_df(df)\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 29,
 | 
						|
   "id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "id               int64\n",
 | 
						|
       "identifier      object\n",
 | 
						|
       "name            object\n",
 | 
						|
       "extra_field    float64\n",
 | 
						|
       "quota          float64\n",
 | 
						|
       "dtype: object"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 29,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df.dtypes"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of type_of_categories.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of representation_category_capacities.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of representations.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of events.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 31,
 | 
						|
   "id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "name_dataset = '1events.csv'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 32,
 | 
						|
   "id": "6cab507d-8b11-404d-9286-5cc205228af9",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "File path :  bdc2324-data/1/1events.csv\n",
 | 
						|
      "Shape :  (1232, 12)\n",
 | 
						|
      "Number of columns :  12\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>created_at</th>\n",
 | 
						|
       "      <th>updated_at</th>\n",
 | 
						|
       "      <th>season_id</th>\n",
 | 
						|
       "      <th>facility_id</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>event_type_id</th>\n",
 | 
						|
       "      <th>manual_added</th>\n",
 | 
						|
       "      <th>is_display</th>\n",
 | 
						|
       "      <th>event_type_key_id</th>\n",
 | 
						|
       "      <th>facility_key_id</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>192</td>\n",
 | 
						|
       "      <td>2020-09-03 13:36:42.216991+02:00</td>\n",
 | 
						|
       "      <td>2021-11-02 15:06:40.663219+01:00</td>\n",
 | 
						|
       "      <td>16</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>frontières</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>c1cecd093146068fd57896e254e98170</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>30329</td>\n",
 | 
						|
       "      <td>2023-11-04 02:50:34.602462+01:00</td>\n",
 | 
						|
       "      <td>2023-11-04 02:52:26.138154+01:00</td>\n",
 | 
						|
       "      <td>2767</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>visite guidée une autre histoire du monde (1h00)</td>\n",
 | 
						|
       "      <td>5</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "      <td>5</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>f510a6710878d7aca36e71c54abab525</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>161</td>\n",
 | 
						|
       "      <td>2020-09-03 13:29:27.944002+02:00</td>\n",
 | 
						|
       "      <td>2021-11-02 15:06:40.652026+01:00</td>\n",
 | 
						|
       "      <td>16</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>visite contée les chercheurs d'or indiv</td>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>21177fa9acad1ae2b1f595690fb853d3</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>5957</td>\n",
 | 
						|
       "      <td>2021-07-31 11:16:42.575583+02:00</td>\n",
 | 
						|
       "      <td>2021-11-02 15:06:40.663219+01:00</td>\n",
 | 
						|
       "      <td>582</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>we dreamt of utopia and we woke up screaming.</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>962601f1eb153d45d49437f8fe839f7f</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>8337</td>\n",
 | 
						|
       "      <td>2021-08-17 13:40:34.111923+02:00</td>\n",
 | 
						|
       "      <td>2021-11-02 15:06:40.663219+01:00</td>\n",
 | 
						|
       "      <td>582</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>jeff koons épisodes 4</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>bfa22f5a2364a2dacfc45cca1c8d3215</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "      id                        created_at                        updated_at  \\\n",
 | 
						|
       "0    192  2020-09-03 13:36:42.216991+02:00  2021-11-02 15:06:40.663219+01:00   \n",
 | 
						|
       "1  30329  2023-11-04 02:50:34.602462+01:00  2023-11-04 02:52:26.138154+01:00   \n",
 | 
						|
       "2    161  2020-09-03 13:29:27.944002+02:00  2021-11-02 15:06:40.652026+01:00   \n",
 | 
						|
       "3   5957  2021-07-31 11:16:42.575583+02:00  2021-11-02 15:06:40.663219+01:00   \n",
 | 
						|
       "4   8337  2021-08-17 13:40:34.111923+02:00  2021-11-02 15:06:40.663219+01:00   \n",
 | 
						|
       "\n",
 | 
						|
       "   season_id  facility_id                                              name  \\\n",
 | 
						|
       "0         16            1                                        frontières   \n",
 | 
						|
       "1       2767            1  visite guidée une autre histoire du monde (1h00)   \n",
 | 
						|
       "2         16            1           visite contée les chercheurs d'or indiv   \n",
 | 
						|
       "3        582            1     we dreamt of utopia and we woke up screaming.   \n",
 | 
						|
       "4        582            1                             jeff koons épisodes 4   \n",
 | 
						|
       "\n",
 | 
						|
       "   event_type_id  manual_added  is_display  event_type_key_id  \\\n",
 | 
						|
       "0              4         False        True                  4   \n",
 | 
						|
       "1              5         False        True                  5   \n",
 | 
						|
       "2              2         False        True                  2   \n",
 | 
						|
       "3              4         False        True                  4   \n",
 | 
						|
       "4              4         False        True                  4   \n",
 | 
						|
       "\n",
 | 
						|
       "   facility_key_id                        identifier  \n",
 | 
						|
       "0                1  c1cecd093146068fd57896e254e98170  \n",
 | 
						|
       "1                1  f510a6710878d7aca36e71c54abab525  \n",
 | 
						|
       "2                1  21177fa9acad1ae2b1f595690fb853d3  \n",
 | 
						|
       "3                1  962601f1eb153d45d49437f8fe839f7f  \n",
 | 
						|
       "4                1  bfa22f5a2364a2dacfc45cca1c8d3215  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 32,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = display_databases(name_dataset)\n",
 | 
						|
    "print(\"Number of columns : \", len(df.columns))\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 33,
 | 
						|
   "id": "9fe57873-8108-44c9-b8a5-f58d3cbb6d17",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of columns :  10\n",
 | 
						|
      "Columns :  Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
 | 
						|
      "       'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
 | 
						|
      "      dtype='object')\n",
 | 
						|
      "Percent of NA for each column :  id                   0.000000\n",
 | 
						|
      "season_id            0.000000\n",
 | 
						|
      "facility_id          0.000000\n",
 | 
						|
      "event_type_id        0.000000\n",
 | 
						|
      "event_type_key_id    0.000000\n",
 | 
						|
      "facility_key_id      0.000000\n",
 | 
						|
      "identifier           0.000000\n",
 | 
						|
      "name                 0.974026\n",
 | 
						|
      "manual_added         0.000000\n",
 | 
						|
      "is_display           0.000000\n",
 | 
						|
      "dtype: float64\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>season_id</th>\n",
 | 
						|
       "      <th>facility_id</th>\n",
 | 
						|
       "      <th>event_type_id</th>\n",
 | 
						|
       "      <th>event_type_key_id</th>\n",
 | 
						|
       "      <th>facility_key_id</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>manual_added</th>\n",
 | 
						|
       "      <th>is_display</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>192</td>\n",
 | 
						|
       "      <td>16</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>c1cecd093146068fd57896e254e98170</td>\n",
 | 
						|
       "      <td>frontières</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>30329</td>\n",
 | 
						|
       "      <td>2767</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>5</td>\n",
 | 
						|
       "      <td>5</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>f510a6710878d7aca36e71c54abab525</td>\n",
 | 
						|
       "      <td>visite guidée une autre histoire du monde (1h00)</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>161</td>\n",
 | 
						|
       "      <td>16</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>21177fa9acad1ae2b1f595690fb853d3</td>\n",
 | 
						|
       "      <td>visite contée les chercheurs d'or indiv</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>5957</td>\n",
 | 
						|
       "      <td>582</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>962601f1eb153d45d49437f8fe839f7f</td>\n",
 | 
						|
       "      <td>we dreamt of utopia and we woke up screaming.</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>8337</td>\n",
 | 
						|
       "      <td>582</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>bfa22f5a2364a2dacfc45cca1c8d3215</td>\n",
 | 
						|
       "      <td>jeff koons épisodes 4</td>\n",
 | 
						|
       "      <td>False</td>\n",
 | 
						|
       "      <td>True</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "      id  season_id  facility_id  event_type_id  event_type_key_id  \\\n",
 | 
						|
       "0    192         16            1              4                  4   \n",
 | 
						|
       "1  30329       2767            1              5                  5   \n",
 | 
						|
       "2    161         16            1              2                  2   \n",
 | 
						|
       "3   5957        582            1              4                  4   \n",
 | 
						|
       "4   8337        582            1              4                  4   \n",
 | 
						|
       "\n",
 | 
						|
       "   facility_key_id                        identifier  \\\n",
 | 
						|
       "0                1  c1cecd093146068fd57896e254e98170   \n",
 | 
						|
       "1                1  f510a6710878d7aca36e71c54abab525   \n",
 | 
						|
       "2                1  21177fa9acad1ae2b1f595690fb853d3   \n",
 | 
						|
       "3                1  962601f1eb153d45d49437f8fe839f7f   \n",
 | 
						|
       "4                1  bfa22f5a2364a2dacfc45cca1c8d3215   \n",
 | 
						|
       "\n",
 | 
						|
       "                                               name  manual_added  is_display  \n",
 | 
						|
       "0                                        frontières         False        True  \n",
 | 
						|
       "1  visite guidée une autre histoire du monde (1h00)         False        True  \n",
 | 
						|
       "2           visite contée les chercheurs d'or indiv         False        True  \n",
 | 
						|
       "3     we dreamt of utopia and we woke up screaming.         False        True  \n",
 | 
						|
       "4                             jeff koons épisodes 4         False        True  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 33,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = process_df(df)\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 34,
 | 
						|
   "id": "7fd9e5bd-baac-4b3b-9ffb-5a9baa18399b",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "id                    int64\n",
 | 
						|
       "season_id             int64\n",
 | 
						|
       "facility_id           int64\n",
 | 
						|
       "event_type_id         int64\n",
 | 
						|
       "event_type_key_id     int64\n",
 | 
						|
       "facility_key_id       int64\n",
 | 
						|
       "identifier           object\n",
 | 
						|
       "name                 object\n",
 | 
						|
       "manual_added           bool\n",
 | 
						|
       "is_display             bool\n",
 | 
						|
       "dtype: object"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 34,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df.dtypes"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "24186efa-5908-4b03-bf52-96415fc8bd54",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of event_types.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 35,
 | 
						|
   "id": "90ab62d4-a086-4469-961c-67eefb375388",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "name_dataset = '1event_types.csv'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 36,
 | 
						|
   "id": "58db1751-fd56-4c28-b49e-bc8235bb0dc8",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "File path :  bdc2324-data/1/1event_types.csv\n",
 | 
						|
      "Shape :  (9, 6)\n",
 | 
						|
      "Number of columns :  6\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>created_at</th>\n",
 | 
						|
       "      <th>updated_at</th>\n",
 | 
						|
       "      <th>fidelity_delay</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>standard</td>\n",
 | 
						|
       "      <td>2020-09-03 12:24:22.574262+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 12:24:22.574262+02:00</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>66</td>\n",
 | 
						|
       "      <td>package</td>\n",
 | 
						|
       "      <td>2020-09-03 14:05:04.648137+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 14:05:04.648137+02:00</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>efe90a8e604a7c840e88d03a67f6b7d8</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>83</td>\n",
 | 
						|
       "      <td>guide multimédias</td>\n",
 | 
						|
       "      <td>2020-09-03 14:15:17.252539+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 14:15:17.252539+02:00</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>ee14c62b3b9f6c7dd5401685a18e4460</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>3</td>\n",
 | 
						|
       "      <td>non défini</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.117024+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.117024+02:00</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>2723</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>2021-12-22 09:45:47.715105+01:00</td>\n",
 | 
						|
       "      <td>2021-12-22 09:45:47.715105+01:00</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "     id               name                        created_at  \\\n",
 | 
						|
       "0     1           standard  2020-09-03 12:24:22.574262+02:00   \n",
 | 
						|
       "1    66            package  2020-09-03 14:05:04.648137+02:00   \n",
 | 
						|
       "2    83  guide multimédias  2020-09-03 14:15:17.252539+02:00   \n",
 | 
						|
       "3     3         non défini  2020-09-03 13:11:23.117024+02:00   \n",
 | 
						|
       "4  2723                NaN  2021-12-22 09:45:47.715105+01:00   \n",
 | 
						|
       "\n",
 | 
						|
       "                         updated_at  fidelity_delay  \\\n",
 | 
						|
       "0  2020-09-03 12:24:22.574262+02:00              36   \n",
 | 
						|
       "1  2020-09-03 14:05:04.648137+02:00              36   \n",
 | 
						|
       "2  2020-09-03 14:15:17.252539+02:00              36   \n",
 | 
						|
       "3  2020-09-03 13:11:23.117024+02:00              36   \n",
 | 
						|
       "4  2021-12-22 09:45:47.715105+01:00              36   \n",
 | 
						|
       "\n",
 | 
						|
       "                         identifier  \n",
 | 
						|
       "0  c00f0c4675b91fb8b918e4079a0b1bac  \n",
 | 
						|
       "1  efe90a8e604a7c840e88d03a67f6b7d8  \n",
 | 
						|
       "2  ee14c62b3b9f6c7dd5401685a18e4460  \n",
 | 
						|
       "3  52ff3466787b4d538407372e5f7afe0f  \n",
 | 
						|
       "4  d41d8cd98f00b204e9800998ecf8427e  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 36,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = display_databases(name_dataset)\n",
 | 
						|
    "print(\"Number of columns : \", len(df.columns))\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 37,
 | 
						|
   "id": "ac93382c-0b5f-462d-8021-0dd1e7201b8c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of columns :  4\n",
 | 
						|
      "Columns :  Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
 | 
						|
      "Percent of NA for each column :  id                 0.000000\n",
 | 
						|
      "fidelity_delay     0.000000\n",
 | 
						|
      "identifier         0.000000\n",
 | 
						|
      "name              11.111111\n",
 | 
						|
      "dtype: float64\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>fidelity_delay</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>c00f0c4675b91fb8b918e4079a0b1bac</td>\n",
 | 
						|
       "      <td>standard</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>66</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>efe90a8e604a7c840e88d03a67f6b7d8</td>\n",
 | 
						|
       "      <td>package</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>83</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>ee14c62b3b9f6c7dd5401685a18e4460</td>\n",
 | 
						|
       "      <td>guide multimédias</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>3</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
 | 
						|
       "      <td>non défini</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>2723</td>\n",
 | 
						|
       "      <td>36</td>\n",
 | 
						|
       "      <td>d41d8cd98f00b204e9800998ecf8427e</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "     id  fidelity_delay                        identifier               name\n",
 | 
						|
       "0     1              36  c00f0c4675b91fb8b918e4079a0b1bac           standard\n",
 | 
						|
       "1    66              36  efe90a8e604a7c840e88d03a67f6b7d8            package\n",
 | 
						|
       "2    83              36  ee14c62b3b9f6c7dd5401685a18e4460  guide multimédias\n",
 | 
						|
       "3     3              36  52ff3466787b4d538407372e5f7afe0f         non défini\n",
 | 
						|
       "4  2723              36  d41d8cd98f00b204e9800998ecf8427e                NaN"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 37,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = process_df(df)\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 38,
 | 
						|
   "id": "18cbd630-3c7d-49e1-932b-9460badf3758",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "id                 int64\n",
 | 
						|
       "fidelity_delay     int64\n",
 | 
						|
       "identifier        object\n",
 | 
						|
       "name              object\n",
 | 
						|
       "dtype: object"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 38,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df.dtypes"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "5847a441-31b9-4802-a5ae-90d8c6d6e153",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep analysis of seasons.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 39,
 | 
						|
   "id": "ae544dcc-f23d-4216-bb5b-597cc1b3765e",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "name_dataset = '1seasons.csv'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 40,
 | 
						|
   "id": "1ac97963-9208-4329-be41-d71a5797487f",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "File path :  bdc2324-data/1/1seasons.csv\n",
 | 
						|
      "Shape :  (13, 6)\n",
 | 
						|
      "Number of columns :  6\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>created_at</th>\n",
 | 
						|
       "      <th>updated_at</th>\n",
 | 
						|
       "      <th>start_date_time</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>943</td>\n",
 | 
						|
       "      <td>2013</td>\n",
 | 
						|
       "      <td>2021-07-29 08:55:33.282607+02:00</td>\n",
 | 
						|
       "      <td>2021-07-29 08:55:33.282607+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>8038da89e49ac5eabb489cfc6cea9fc1</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>129</td>\n",
 | 
						|
       "      <td>2014</td>\n",
 | 
						|
       "      <td>2020-09-03 15:13:08.105567+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 15:13:08.105567+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>cee8d6b7ce52554fd70354e37bbf44a2</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>3</td>\n",
 | 
						|
       "      <td>2015</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:19.405037+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:19.405037+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>65d2ea03425887a717c435081cfc5dbb</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>2016</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:19.401001+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:19.401001+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>95192c98732387165bf8e396c0f2dad2</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>2017</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:19.409005+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:19.409005+02:00</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>8d8818c8e140c64c743113f563cf750f</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "    id  name                        created_at  \\\n",
 | 
						|
       "0  943  2013  2021-07-29 08:55:33.282607+02:00   \n",
 | 
						|
       "1  129  2014  2020-09-03 15:13:08.105567+02:00   \n",
 | 
						|
       "2    3  2015  2020-09-03 13:11:19.405037+02:00   \n",
 | 
						|
       "3    2  2016  2020-09-03 13:11:19.401001+02:00   \n",
 | 
						|
       "4    4  2017  2020-09-03 13:11:19.409005+02:00   \n",
 | 
						|
       "\n",
 | 
						|
       "                         updated_at  start_date_time  \\\n",
 | 
						|
       "0  2021-07-29 08:55:33.282607+02:00              NaN   \n",
 | 
						|
       "1  2020-09-03 15:13:08.105567+02:00              NaN   \n",
 | 
						|
       "2  2020-09-03 13:11:19.405037+02:00              NaN   \n",
 | 
						|
       "3  2020-09-03 13:11:19.401001+02:00              NaN   \n",
 | 
						|
       "4  2020-09-03 13:11:19.409005+02:00              NaN   \n",
 | 
						|
       "\n",
 | 
						|
       "                         identifier  \n",
 | 
						|
       "0  8038da89e49ac5eabb489cfc6cea9fc1  \n",
 | 
						|
       "1  cee8d6b7ce52554fd70354e37bbf44a2  \n",
 | 
						|
       "2  65d2ea03425887a717c435081cfc5dbb  \n",
 | 
						|
       "3  95192c98732387165bf8e396c0f2dad2  \n",
 | 
						|
       "4  8d8818c8e140c64c743113f563cf750f  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 40,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = display_databases(name_dataset)\n",
 | 
						|
    "print(\"Number of columns : \", len(df.columns))\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 41,
 | 
						|
   "id": "b4593d46-105c-47dd-aa71-babd8e63e65b",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of columns :  4\n",
 | 
						|
      "Columns :  Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
 | 
						|
      "Percent of NA for each column :  id                   0.000000\n",
 | 
						|
      "identifier           0.000000\n",
 | 
						|
      "name                 7.692308\n",
 | 
						|
      "start_date_time    100.000000\n",
 | 
						|
      "dtype: float64\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>start_date_time</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>943</td>\n",
 | 
						|
       "      <td>8038da89e49ac5eabb489cfc6cea9fc1</td>\n",
 | 
						|
       "      <td>2013</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>129</td>\n",
 | 
						|
       "      <td>cee8d6b7ce52554fd70354e37bbf44a2</td>\n",
 | 
						|
       "      <td>2014</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>2</th>\n",
 | 
						|
       "      <td>3</td>\n",
 | 
						|
       "      <td>65d2ea03425887a717c435081cfc5dbb</td>\n",
 | 
						|
       "      <td>2015</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>3</th>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>95192c98732387165bf8e396c0f2dad2</td>\n",
 | 
						|
       "      <td>2016</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>4</th>\n",
 | 
						|
       "      <td>4</td>\n",
 | 
						|
       "      <td>8d8818c8e140c64c743113f563cf750f</td>\n",
 | 
						|
       "      <td>2017</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "    id                        identifier  name  start_date_time\n",
 | 
						|
       "0  943  8038da89e49ac5eabb489cfc6cea9fc1  2013              NaN\n",
 | 
						|
       "1  129  cee8d6b7ce52554fd70354e37bbf44a2  2014              NaN\n",
 | 
						|
       "2    3  65d2ea03425887a717c435081cfc5dbb  2015              NaN\n",
 | 
						|
       "3    2  95192c98732387165bf8e396c0f2dad2  2016              NaN\n",
 | 
						|
       "4    4  8d8818c8e140c64c743113f563cf750f  2017              NaN"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 41,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = process_df(df)\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 42,
 | 
						|
   "id": "5d3b096d-8e73-4514-94e5-f2dcd4d0a89c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "id                   int64\n",
 | 
						|
       "identifier          object\n",
 | 
						|
       "name                object\n",
 | 
						|
       "start_date_time    float64\n",
 | 
						|
       "dtype: object"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 42,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df.dtypes"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "a7b00bc7-eae6-457c-ac68-a4a55a6d1c8c",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "#### Deep Analysis of facilities.csv"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 43,
 | 
						|
   "id": "d95ef015-d44c-4353-8761-771b910d21c9",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "name_dataset = '1facilities.csv'"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 44,
 | 
						|
   "id": "ef5fe794-8df7-4f27-8554-ecdc4074ac0b",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "File path :  bdc2324-data/1/1facilities.csv\n",
 | 
						|
      "Shape :  (2, 7)\n",
 | 
						|
      "Number of columns :  7\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>created_at</th>\n",
 | 
						|
       "      <th>updated_at</th>\n",
 | 
						|
       "      <th>street_id</th>\n",
 | 
						|
       "      <th>fixed_capacity</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>non défini</td>\n",
 | 
						|
       "      <td>2020-09-03 13:16:35.293111+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:16:35.293111+02:00</td>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>mucem</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.133059+02:00</td>\n",
 | 
						|
       "      <td>2020-09-03 13:11:23.133059+02:00</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "      <td>702bd76fe3dd5dbcf118a6965a946f54</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "   id        name                        created_at  \\\n",
 | 
						|
       "0   2  non défini  2020-09-03 13:16:35.293111+02:00   \n",
 | 
						|
       "1   1       mucem  2020-09-03 13:11:23.133059+02:00   \n",
 | 
						|
       "\n",
 | 
						|
       "                         updated_at  street_id  fixed_capacity  \\\n",
 | 
						|
       "0  2020-09-03 13:16:35.293111+02:00          2             NaN   \n",
 | 
						|
       "1  2020-09-03 13:11:23.133059+02:00          1             NaN   \n",
 | 
						|
       "\n",
 | 
						|
       "                         identifier  \n",
 | 
						|
       "0  52ff3466787b4d538407372e5f7afe0f  \n",
 | 
						|
       "1  702bd76fe3dd5dbcf118a6965a946f54  "
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 44,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = display_databases(name_dataset)\n",
 | 
						|
    "print(\"Number of columns : \", len(df.columns))\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 45,
 | 
						|
   "id": "e3621201-fab9-49fd-95c1-0b9d5da76e50",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Number of columns :  5\n",
 | 
						|
      "Columns :  Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n",
 | 
						|
      "Percent of NA for each column :  id                  0.0\n",
 | 
						|
      "street_id           0.0\n",
 | 
						|
      "identifier          0.0\n",
 | 
						|
      "name                0.0\n",
 | 
						|
      "fixed_capacity    100.0\n",
 | 
						|
      "dtype: float64\n"
 | 
						|
     ]
 | 
						|
    },
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/html": [
 | 
						|
       "<div>\n",
 | 
						|
       "<style scoped>\n",
 | 
						|
       "    .dataframe tbody tr th:only-of-type {\n",
 | 
						|
       "        vertical-align: middle;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe tbody tr th {\n",
 | 
						|
       "        vertical-align: top;\n",
 | 
						|
       "    }\n",
 | 
						|
       "\n",
 | 
						|
       "    .dataframe thead th {\n",
 | 
						|
       "        text-align: right;\n",
 | 
						|
       "    }\n",
 | 
						|
       "</style>\n",
 | 
						|
       "<table border=\"1\" class=\"dataframe\">\n",
 | 
						|
       "  <thead>\n",
 | 
						|
       "    <tr style=\"text-align: right;\">\n",
 | 
						|
       "      <th></th>\n",
 | 
						|
       "      <th>id</th>\n",
 | 
						|
       "      <th>street_id</th>\n",
 | 
						|
       "      <th>identifier</th>\n",
 | 
						|
       "      <th>name</th>\n",
 | 
						|
       "      <th>fixed_capacity</th>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </thead>\n",
 | 
						|
       "  <tbody>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>0</th>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>2</td>\n",
 | 
						|
       "      <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
 | 
						|
       "      <td>non défini</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "    <tr>\n",
 | 
						|
       "      <th>1</th>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>1</td>\n",
 | 
						|
       "      <td>702bd76fe3dd5dbcf118a6965a946f54</td>\n",
 | 
						|
       "      <td>mucem</td>\n",
 | 
						|
       "      <td>NaN</td>\n",
 | 
						|
       "    </tr>\n",
 | 
						|
       "  </tbody>\n",
 | 
						|
       "</table>\n",
 | 
						|
       "</div>"
 | 
						|
      ],
 | 
						|
      "text/plain": [
 | 
						|
       "   id  street_id                        identifier        name  fixed_capacity\n",
 | 
						|
       "0   2          2  52ff3466787b4d538407372e5f7afe0f  non défini             NaN\n",
 | 
						|
       "1   1          1  702bd76fe3dd5dbcf118a6965a946f54       mucem             NaN"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 45,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df = process_df(df)\n",
 | 
						|
    "df.head()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 46,
 | 
						|
   "id": "1b198b92-8654-4531-a0dd-8f2e01c2e6c1",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "id                  int64\n",
 | 
						|
       "street_id           int64\n",
 | 
						|
       "identifier         object\n",
 | 
						|
       "name               object\n",
 | 
						|
       "fixed_capacity    float64\n",
 | 
						|
       "dtype: object"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 46,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "df.dtypes"
 | 
						|
   ]
 | 
						|
  }
 | 
						|
 ],
 | 
						|
 "metadata": {
 | 
						|
  "kernelspec": {
 | 
						|
   "display_name": "Python 3 (ipykernel)",
 | 
						|
   "language": "python",
 | 
						|
   "name": "python3"
 | 
						|
  },
 | 
						|
  "language_info": {
 | 
						|
   "codemirror_mode": {
 | 
						|
    "name": "ipython",
 | 
						|
    "version": 3
 | 
						|
   },
 | 
						|
   "file_extension": ".py",
 | 
						|
   "mimetype": "text/x-python",
 | 
						|
   "name": "python",
 | 
						|
   "nbconvert_exporter": "python",
 | 
						|
   "pygments_lexer": "ipython3",
 | 
						|
   "version": "3.10.13"
 | 
						|
  }
 | 
						|
 },
 | 
						|
 "nbformat": 4,
 | 
						|
 "nbformat_minor": 5
 | 
						|
}
 |