From 2727b010af49b8ed71be7e40cc470880702590bf Mon Sep 17 00:00:00 2001 From: Fanta RODRIGUE Date: Wed, 10 Jan 2024 19:18:34 +0100 Subject: [PATCH] revert 98f6efa2e5b12695172a5bfa5c26c6fc6bfa9e45 revert Identify common datasets --- Notebook_AR.ipynb | 703 ---------------------------------------------- 1 file changed, 703 deletions(-) delete mode 100644 Notebook_AR.ipynb diff --git a/Notebook_AR.ipynb b/Notebook_AR.ipynb deleted file mode 100644 index a3c291b..0000000 --- a/Notebook_AR.ipynb +++ /dev/null @@ -1,703 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "455cc769-1b3b-4fef-b395-e74a988ceed3", - "metadata": {}, - "source": [ - "## Notebook Alexis" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "20eeb149-6618-4ef2-9cfd-ff062950f36c", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import os\n", - "import s3fs" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "30494c5e-9649-4fff-8708-617544188b20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['bdc2324-data/1',\n", - " 'bdc2324-data/10',\n", - " 'bdc2324-data/101',\n", - " 'bdc2324-data/11',\n", - " 'bdc2324-data/12',\n", - " 'bdc2324-data/13',\n", - " 'bdc2324-data/14',\n", - " 'bdc2324-data/2',\n", - " 'bdc2324-data/3',\n", - " 'bdc2324-data/4',\n", - " 'bdc2324-data/5',\n", - " 'bdc2324-data/6',\n", - " 'bdc2324-data/7',\n", - " 'bdc2324-data/8',\n", - " 'bdc2324-data/9']" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", - "\n", - "BUCKET = \"bdc2324-data\"\n", - "fs.ls(BUCKET)" - ] - }, - { - "cell_type": "markdown", - "id": "2feffee9-9f23-4caa-8a01-9e4a93abbf5d", - "metadata": {}, - "source": [ - "### I. Analyse fichier 8" - ] - }, - { - "cell_type": "markdown", - "id": "f54ba449-2051-4acd-939d-d30abd5452fe", - "metadata": {}, - "source": [ - "This section describes the databases associated with company 8. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1cce705-46e1-42de-8e93-2ee15312d288", - "metadata": {}, - "outputs": [], - "source": [ - "directory_path = '8'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c", - "metadata": {}, - "outputs": [], - "source": [ - "# check the files in the directory\n", - "\n", - "objects = fs.ls(f'{BUCKET}/{directory_path}')\n", - "\n", - "for file in objects:\n", - " print(file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65cb38ad-52ae-4266-85d8-c47d81b00283", - "metadata": {}, - "outputs": [], - "source": [ - "def display_databases(file_name):\n", - " \"\"\"\n", - " This function returns the file from s3 storage\n", - " \"\"\"\n", - " file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", - " print(\"File path : \", file_path)\n", - " with fs.open(file_path, mode=\"rb\") as file_in:\n", - " df = pd.read_csv(file_in, sep=\",\")\n", - " \n", - " print(\"Shape : \", df.shape)\n", - " return df\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "ddd545ef-7e9f-4696-962a-115294991641", - "metadata": {}, - "source": [ - "#### Lookt at campaigns files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0214d30d-5f83-498f-867f-e67b5793b731", - "metadata": {}, - "outputs": [], - "source": [ - "campaigns = display_databases(\"8campaigns.csv\")\n", - "campaigns.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7982be4-2c42-4a91-be5a-329a999644cc", - "metadata": {}, - "outputs": [], - "source": [ - "campaign_stats = display_databases(\"8campaign_stats.csv\")\n", - "campaign_stats.head()" - ] - }, - { - "cell_type": "markdown", - "id": "e6512bc9-91f5-4fe4-a637-a4e84dc497a9", - "metadata": {}, - "source": [ - "#### Look at links files" - ] - }, - { - "cell_type": "markdown", - "id": "28e7c1fe-470f-4d84-87b8-a711a973500b", - "metadata": {}, - "source": [ - "There is no links file for these company. Only the link_stats file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e973575b-4ed6-4b23-8024-f383ac82e87c", - "metadata": {}, - "outputs": [], - "source": [ - "links_stats = display_databases(\"8link_stats.csv\")\n", - "links_stats.head()" - ] - }, - { - "cell_type": "markdown", - "id": "8dfcca1f-1323-413f-aa8d-3ee5ce2610a8", - "metadata": {}, - "source": [ - "#### Analyse Customersplus file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b523575-c779-451c-a12e-a36fb4ad232c", - "metadata": {}, - "outputs": [], - "source": [ - "file_name = \"8customersplus.csv\"\n", - "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", - "print(file_path)\n", - "with fs.open(file_path, mode=\"rb\") as file_in:\n", - " customersplus = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "customersplus.head()" - ] - }, - { - "cell_type": "markdown", - "id": "fe56785a-ed3c-4322-aafa-a630f97b836f", - "metadata": {}, - "source": [ - "#### Analyse Structures files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd", - "metadata": {}, - "outputs": [], - "source": [ - "file_name = \"8structures.csv\"\n", - "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", - "print(file_path)\n", - "try:\n", - " with fs.open(file_path, mode=\"rb\") as file_in:\n", - " structures = pd.read_csv(file_in, sep=\",\")\n", - "except:\n", - " print(\"No structures database\")" - ] - }, - { - "cell_type": "markdown", - "id": "b8452558-2d32-459b-91e7-f6042345e465", - "metadata": {}, - "source": [ - "For Stade Français, there is no structures, tags and structure_tag_mapping databases" - ] - }, - { - "cell_type": "markdown", - "id": "285b1422-9ca9-4afd-b752-777a54aaa677", - "metadata": {}, - "source": [ - "#### Analyze Target databases" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178", - "metadata": {}, - "outputs": [], - "source": [ - "file_name = \"8customer_target_mappings.csv\"\n", - "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", - "print(file_path)\n", - "try:\n", - " with fs.open(file_path, mode=\"rb\") as file_in:\n", - " customer_targets = pd.read_csv(file_in, sep=\",\")\n", - " \n", - "except:\n", - " print(\"No such database in s3\")\n", - "\n", - "print(\"Shape : \", customer_targets.shape)\n", - "customer_targets.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263", - "metadata": {}, - "outputs": [], - "source": [ - "file_name = \"8targets.csv\"\n", - "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", - "print(file_path)\n", - "try:\n", - " with fs.open(file_path, mode=\"rb\") as file_in:\n", - " targets = pd.read_csv(file_in, sep=\",\")\n", - " \n", - "except:\n", - " print(\"No such database in s3\")\n", - "\n", - "print(\"Shape : \", targets.shape)\n", - "targets.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "85696d74-3b2f-4368-9045-44db5322b60d", - "metadata": {}, - "outputs": [], - "source": [ - "file_name = \"8target_types.csv\"\n", - "file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n", - "print(file_path)\n", - "try:\n", - " with fs.open(file_path, mode=\"rb\") as file_in:\n", - " target_types = pd.read_csv(file_in, sep=\",\")\n", - " \n", - "except:\n", - " print(\"No such database in s3\")\n", - "\n", - "print(\"Shape : \", target_types.shape)\n", - "target_types.head()" - ] - }, - { - "cell_type": "markdown", - "id": "cdc6416b-3deb-446c-8957-435745b93533", - "metadata": {}, - "source": [ - "#### Analyze consumption files" - ] - }, - { - "cell_type": "markdown", - "id": "f8622bd5-a5ab-403f-ab01-758aec879ee4", - "metadata": {}, - "source": [ - "Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv\n", - "\n", - "However, there is no consumptions.csv file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4", - "metadata": {}, - "outputs": [], - "source": [ - "purchases = display_databases(\"8purchases.csv\")\n", - "purchases.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "903321fb-99f8-475d-b4a6-c70ec2efe190", - "metadata": {}, - "outputs": [], - "source": [ - "tickets = display_databases(\"8tickets.csv\")\n", - "tickets.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "243e6942-0233-4cd5-b32b-e005457131d2", - "metadata": {}, - "outputs": [], - "source": [ - "suppliers = display_databases(\"8suppliers.csv\")\n", - "suppliers.head()" - ] - }, - { - "cell_type": "markdown", - "id": "fd8c876a-f0c5-4123-a422-c267af5f29b1", - "metadata": {}, - "source": [ - "#### Analyse product file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6b82efce-1dee-4d89-8585-28c4ad477eef", - "metadata": {}, - "outputs": [], - "source": [ - "products = display_databases(\"8products.csv\")\n", - "products.head()" - ] - }, - { - "cell_type": "markdown", - "id": "8ad143b2-2869-4bd2-982e-688498b98727", - "metadata": {}, - "source": [ - "#### Analyze pricing files" - ] - }, - { - "cell_type": "markdown", - "id": "9a54e9a5-801d-4000-9e76-e792edbf7e41", - "metadata": {}, - "source": [ - "Meaning pricing_formulas.csv and type_of_pricing_formulas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "daf37bff-a26d-4ff5-ad50-c90f917164bd", - "metadata": {}, - "outputs": [], - "source": [ - "pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n", - "pricing_formulas.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdb14488-b093-4b39-84fa-1c2b4576208f", - "metadata": {}, - "outputs": [], - "source": [ - "type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n", - "type_pricing_formulas.head()" - ] - }, - { - "cell_type": "markdown", - "id": "a084297a-4fd7-4cda-b513-7704f4244a5c", - "metadata": {}, - "source": [ - "#### Analyze type of products" - ] - }, - { - "cell_type": "markdown", - "id": "76a67ea7-8720-441e-8973-23e5d105370e", - "metadata": {}, - "source": [ - "Meaning categories.csv, type_of_categories.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6582694d-5339-4f33-a943-c73033121a90", - "metadata": {}, - "outputs": [], - "source": [ - "categories = display_databases(\"8categories.csv\")\n", - "categories.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "589076df-1958-42de-9941-1aff9fa8536f", - "metadata": {}, - "outputs": [], - "source": [ - "type_categories = display_databases(\"8type_of_categories.csv\")\n", - "type_categories.head()" - ] - }, - { - "cell_type": "markdown", - "id": "3427b681-4c05-4e4e-9c2b-867ee789f98c", - "metadata": {}, - "source": [ - "#### Analyze type of representations" - ] - }, - { - "cell_type": "markdown", - "id": "9381e36b-090a-44c5-a29d-3ac4c9a4431e", - "metadata": {}, - "source": [ - "Meaning representation_category_capacities.csv, representations.csv, representations_types.csv\n", - "\n", - "however there is no representation_types database" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346", - "metadata": {}, - "outputs": [], - "source": [ - "representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n", - "representation_category_capacities.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd405913-033d-4f15-a5b9-103d577baaff", - "metadata": {}, - "outputs": [], - "source": [ - "representations = display_databases(\"8representations.csv\")\n", - "representations.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f", - "metadata": {}, - "outputs": [], - "source": [ - "#representation_type = display_databases(\"8representation_types.csv\")" - ] - }, - { - "cell_type": "markdown", - "id": "a9b02406-2a69-4431-8d49-3c6bd6a5e1c7", - "metadata": {}, - "source": [ - "#### Analyze type of events" - ] - }, - { - "cell_type": "markdown", - "id": "1d554266-282c-4f64-9a0f-ddcf591ec912", - "metadata": {}, - "source": [ - "Meaning events.csv, event_types.csv, seasons.csv and facilities.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf", - "metadata": {}, - "outputs": [], - "source": [ - "events = display_databases(\"8events.csv\")\n", - "events.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460", - "metadata": {}, - "outputs": [], - "source": [ - "event_types = display_databases(\"8event_types.csv\")\n", - "event_types.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cba0ee58-6280-45fe-99b3-0be09db5922b", - "metadata": {}, - "outputs": [], - "source": [ - "seasons = display_databases(\"8seasons.csv\")\n", - "seasons.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0", - "metadata": {}, - "outputs": [], - "source": [ - "facilities = display_databases(\"8facilities.csv\")\n", - "facilities.head()" - ] - }, - { - "cell_type": "markdown", - "id": "c7467d41-0ded-465d-bb08-15be914a166b", - "metadata": {}, - "source": [ - "#### Analyze annexe databases" - ] - }, - { - "cell_type": "markdown", - "id": "17e9e334-0ae4-48d8-bed5-b50b4af49d5b", - "metadata": {}, - "source": [ - "Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csc" - ] - }, - { - "cell_type": "markdown", - "id": "d3ec1040-48b2-40bb-8947-920ddb4589f3", - "metadata": {}, - "source": [ - "## II. Identify Commons Datasets" - ] - }, - { - "cell_type": "markdown", - "id": "ec528a8a-df38-48e2-a1be-4a1459a80a1e", - "metadata": {}, - "source": [ - "From the analyze of the 8th company, we notice that some databases does not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the common databases between all companies" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "c240b811-48a6-4501-9e70-bc51d69e3ac4", - "metadata": {}, - "outputs": [], - "source": [ - "## We first construct a dictionary reporting all the datasets for each companies\n", - "\n", - "companies = fs.ls(BUCKET)\n", - "companies_database = {}\n", - "\n", - "for company in companies:\n", - " companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] \n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "54057367-9df9-42f4-aa07-bf524bb76462", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of databases : 30\n" - ] - } - ], - "source": [ - "# Then we create a list of all database\n", - "\n", - "all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]\n", - "print(\"Number of databases : \",len(all_database))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "63914e20-9efc-4088-877b-edab5f225d00", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "30\n", - "23\n" - ] - } - ], - "source": [ - "## We then create a set of database in common for all companies\n", - "\n", - "data_in_common = set(all_database)\n", - "\n", - "print(len(data_in_common))\n", - "\n", - "for key in companies_database:\n", - " diff_database = data_in_common.symmetric_difference(companies_database[key])\n", - " data_in_common = data_in_common - diff_database\n", - "\n", - "print(len(data_in_common))\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "676d8536-7d8c-4075-a357-b8d06e501ca8", - "metadata": {}, - "source": [ - "## Create Universal database" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}