{ "cells": [ { "cell_type": "markdown", "id": "c4205b5d-e052-4863-a46b-20e4757052a7", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "ae3af8e6-ced8-4994-8877-fa98d4297cc0", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "dd3184e7-54a1-4463-af42-5850d9517a41", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 28, "id": "b6035982-9ff4-4013-9792-2d50e10db3d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1/1campaign_stats.csv',\n", " 'bdc2324-data/1/1campaigns.csv',\n", " 'bdc2324-data/1/1categories.csv',\n", " 'bdc2324-data/1/1countries.csv',\n", " 'bdc2324-data/1/1currencies.csv',\n", " 'bdc2324-data/1/1customer_target_mappings.csv',\n", " 'bdc2324-data/1/1customersplus.csv',\n", " 'bdc2324-data/1/1event_types.csv',\n", " 'bdc2324-data/1/1events.csv',\n", " 'bdc2324-data/1/1facilities.csv',\n", " 'bdc2324-data/1/1link_stats.csv',\n", " 'bdc2324-data/1/1pricing_formulas.csv',\n", " 'bdc2324-data/1/1product_packs.csv',\n", " 'bdc2324-data/1/1products.csv',\n", " 'bdc2324-data/1/1products_groups.csv',\n", " 'bdc2324-data/1/1purchases.csv',\n", " 'bdc2324-data/1/1representation_category_capacities.csv',\n", " 'bdc2324-data/1/1representations.csv',\n", " 'bdc2324-data/1/1seasons.csv',\n", " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", " 'bdc2324-data/1/1suppliers.csv',\n", " 'bdc2324-data/1/1tags.csv',\n", " 'bdc2324-data/1/1target_types.csv',\n", " 'bdc2324-data/1/1targets.csv',\n", " 'bdc2324-data/1/1tickets.csv',\n", " 'bdc2324-data/1/1type_of_categories.csv',\n", " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", " 'bdc2324-data/1/1type_ofs.csv']" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import s3fs\n", "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "BUCKET = \"bdc2324-data/1\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 7, "id": "b86c935d-124f-453f-80dd-83ea6770d09c", "metadata": {}, "outputs": [], "source": [ "dic_base=['campaign_stats','campaigns','categories','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets','type_of_categories','type_of_pricing_formulas','type_ofs']" ] }, { "cell_type": "code", "execution_count": 3, "id": "f6d0b27c-0ecd-406b-b042-6c3802dd68fd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425/1008972637.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "dic_base=['campaign_stats','campaigns','categories','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets','type_of_categories','type_of_pricing_formulas','type_ofs']\n", "for nom_base in dic_base:\n", " FILE_PATH_S3_fanta = 'bdc2324-data/1/1' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "2a6b5e22-3370-457f-83b7-dd1e13663229", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bdc2324-data/1/1type_ofs.csv'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "FILE_PATH_S3_fanta" ] }, { "cell_type": "markdown", "id": "79012186-ea51-4252-843e-36a9bbe3847e", "metadata": {}, "source": [ "# Analyse exploratoire " ] }, { "cell_type": "markdown", "id": "1a365f29-4766-47d8-9796-24a5271867b2", "metadata": {}, "source": [ "## I. Base type_of_pricing_formulas" ] }, { "cell_type": "markdown", "id": "bcc14f93-2289-44eb-816b-a51049b258df", "metadata": {}, "source": [ "## Detection des valeur manquantes" ] }, { "cell_type": "raw", "id": "ab2ec4c4-9d38-4aeb-8202-9116df3cdd66", "metadata": {}, "source": [ "dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { "cell_type": "markdown", "id": "88759b4a-2633-478d-abce-29abeac376d1", "metadata": {}, "source": [ "def verifier_donnees_manquantes(base):\n", " donnees_manquantes = base.isna().sum()\n", " print(\"Données manquantes pour la base :\")\n", " print(donnees_manquantes)" ] }, { "cell_type": "markdown", "id": "df3075b4-1490-4cf2-a3fe-c6d4e2144ae3", "metadata": {}, "source": [ "for nom_base in dic_prod_princing:\n", " verifier_donnees_manquantes(nom_base)" ] }, { "cell_type": "code", "execution_count": 6, "id": "e0c67c01-e837-4772-b070-d1be0d895a36", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0\n", "type_of_id 0\n", "pricing_formula_id 0\n", "created_at 0\n", "updated_at 0\n", "identifier 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan d\n", "\n", "type_of_pricing_formulas.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "83a6a48d-effe-4537-b4bb-d5a540b610f1", "metadata": {}, "outputs": [], "source": [ "#variable retenu:[[\"id\",\"type_of_id\",\"pricing_formula_id\"]]" ] }, { "cell_type": "code", "execution_count": 7, "id": "3eaffaa6-1164-4ee9-a671-8b5eb3df797d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
0111272021-01-05 11:55:51.226960+01:002021-01-05 11:55:51.226960+01:00cf2918b25e6dcf8c30798ca05c8ec8ed
12124252021-01-05 11:55:51.235606+01:002021-01-05 11:55:51.235606+01:002c8ee3f7c1487d792b6c946314e681f2
23129372021-01-05 11:55:51.240114+01:002021-01-05 11:55:51.240114+01:0044e55c85e4eb59b3c3c01c137a6b25fc
341482021-01-05 11:55:51.244638+01:002021-01-05 11:55:51.244638+01:00ee3bb93b7e2217cd86a49d547fedf6c6
45172021-01-05 11:55:51.249409+01:002021-01-05 11:55:51.249409+01:00ae701668574f1a653d2b21ddfd250620
.....................
563564466562022-02-18 16:15:58.872249+01:002022-02-18 16:15:58.872249+01:00f669824cdca9de9697f07ff3ba365a8d
564565466072022-02-18 16:15:59.231018+01:002022-02-18 16:15:59.231018+01:006421c8146a598758139153b0e7b921ea
565566467002022-02-18 16:15:59.724812+01:002022-02-18 16:15:59.724812+01:006823f6d4d80b322fbfb8b83545a9f96d
566567481182022-02-18 16:16:00.163381+01:002022-02-18 16:16:00.163381+01:0035cfc12584b4d1b94795d97fd0aa56e8
5675697481572023-03-13 11:30:29.480161+01:002023-03-13 11:30:29.480161+01:0055863541f33fd229ac9b54d9ec1f4874
\n", "

568 rows × 6 columns

\n", "
" ], "text/plain": [ " id type_of_id pricing_formula_id created_at \\\n", "0 1 1 127 2021-01-05 11:55:51.226960+01:00 \n", "1 2 1 2425 2021-01-05 11:55:51.235606+01:00 \n", "2 3 1 2937 2021-01-05 11:55:51.240114+01:00 \n", "3 4 1 48 2021-01-05 11:55:51.244638+01:00 \n", "4 5 1 7 2021-01-05 11:55:51.249409+01:00 \n", ".. ... ... ... ... \n", "563 564 4 6656 2022-02-18 16:15:58.872249+01:00 \n", "564 565 4 6607 2022-02-18 16:15:59.231018+01:00 \n", "565 566 4 6700 2022-02-18 16:15:59.724812+01:00 \n", "566 567 4 8118 2022-02-18 16:16:00.163381+01:00 \n", "567 569 7 48157 2023-03-13 11:30:29.480161+01:00 \n", "\n", " updated_at identifier \n", "0 2021-01-05 11:55:51.226960+01:00 cf2918b25e6dcf8c30798ca05c8ec8ed \n", "1 2021-01-05 11:55:51.235606+01:00 2c8ee3f7c1487d792b6c946314e681f2 \n", "2 2021-01-05 11:55:51.240114+01:00 44e55c85e4eb59b3c3c01c137a6b25fc \n", "3 2021-01-05 11:55:51.244638+01:00 ee3bb93b7e2217cd86a49d547fedf6c6 \n", "4 2021-01-05 11:55:51.249409+01:00 ae701668574f1a653d2b21ddfd250620 \n", ".. ... ... \n", "563 2022-02-18 16:15:58.872249+01:00 f669824cdca9de9697f07ff3ba365a8d \n", "564 2022-02-18 16:15:59.231018+01:00 6421c8146a598758139153b0e7b921ea \n", "565 2022-02-18 16:15:59.724812+01:00 6823f6d4d80b322fbfb8b83545a9f96d \n", "566 2022-02-18 16:16:00.163381+01:00 35cfc12584b4d1b94795d97fd0aa56e8 \n", "567 2023-03-13 11:30:29.480161+01:00 55863541f33fd229ac9b54d9ec1f4874 \n", "\n", "[568 rows x 6 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type_of_pricing_formulas" ] }, { "cell_type": "code", "execution_count": 8, "id": "57298669-8d55-40d5-a5aa-4c5df984eec7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "type_of_id int64\n", "pricing_formula_id int64\n", "created_at object\n", "updated_at object\n", "identifier object\n", "dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "type_of_pricing_formulas.dtypes" ] }, { "cell_type": "code", "execution_count": 9, "id": "c11850cb-8833-44c0-a11d-9695d620a42b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, type_of_id, pricing_formula_id, created_at, updated_at, identifier]\n", "Index: []" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "type_of_pricing_formulas.loc[type_of_pricing_formulas['id'].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "7a40de03-5e18-4d3d-a0f8-da960c29fad8", "metadata": {}, "source": [ "## II.products_groups" ] }, { "cell_type": "code", "execution_count": 10, "id": "89909175-6734-4e8e-8632-d6f8ca812388", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0\n", "percent_price 0\n", "max_price 0\n", "min_price 0\n", "category_id 0\n", "pricing_formula_id 0\n", "representation_id 0\n", "created_at 0\n", "updated_at 0\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "products_groups.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "e0518684-c83c-4f0a-89ea-d7dcfd60051d", "metadata": {}, "outputs": [], "source": [ "#variable retenu:[[\"id\",\"percent_price\",\"max_price\",\"min_price\",\"category_id\",\"pricing_formula_id\",\"representation_id\"]]" ] }, { "cell_type": "code", "execution_count": 11, "id": "6a187170-96c4-48d2-9568-b270f67e2c27", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "percent_price float64\n", "max_price float64\n", "min_price float64\n", "category_id int64\n", "pricing_formula_id int64\n", "representation_id int64\n", "created_at object\n", "updated_at object\n", "dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "products_groups.dtypes" ] }, { "cell_type": "code", "execution_count": 12, "id": "2fba2cb0-a6a4-43b2-a854-3be07939c28b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpercent_pricemax_pricemin_pricecategory_idpricing_formula_idrepresentation_idcreated_atupdated_at
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, percent_price, max_price, min_price, category_id, pricing_formula_id, representation_id, created_at, updated_at]\n", "Index: []" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "products_groups.loc[products_groups[['id','pricing_formula_id','representation_id']].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "5312ac13-8fbd-4c3f-a98a-8c28f079a599", "metadata": {}, "source": [ "## III.pricing_formulas" ] }, { "cell_type": "code", "execution_count": 13, "id": "3383a773-0817-4b23-84e7-8d5d0c74b179", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", "

556 rows × 6 columns

\n", "
" ], "text/plain": [ " id name created_at \\\n", "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", ".. ... ... ... \n", "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", "\n", " updated_at extra_field \\\n", "0 2022-07-08 07:08:26.802266+02:00 NaN \n", "1 2022-02-18 15:57:55.792581+01:00 NaN \n", "2 2021-01-25 19:16:05.187114+01:00 NaN \n", "3 2023-03-13 11:30:29.525335+01:00 NaN \n", "4 2021-01-25 19:23:06.816402+01:00 NaN \n", ".. ... ... \n", "551 2022-02-18 15:57:55.792581+01:00 NaN \n", "552 2022-02-18 15:57:55.792581+01:00 NaN \n", "553 2022-02-18 15:55:03.576236+01:00 NaN \n", "554 2022-02-18 16:16:00.199543+01:00 NaN \n", "555 2022-02-18 16:16:00.199543+01:00 NaN \n", "\n", " identifier \n", "0 21d4b0043c12b21952b0797d140991a1 \n", "1 223b09e6c3f1f75dbf8df019af97a555 \n", "2 ba33b7b6d225a75d713a356b49c4d915 \n", "3 a00b61ad933518856f86e63ca91a5750 \n", "4 7f6013803c242253a5ccde80f780984f \n", ".. ... \n", "551 7d888e42abe101fc8b21dc88948c8b74 \n", "552 3cf21731c25eee650d5b232ee4780563 \n", "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", "554 4c53016fc65847646f600eff853593e5 \n", "555 90e642c0e1ef6bc9f2bc43089798de00 \n", "\n", "[556 rows x 6 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pricing_formulas" ] }, { "cell_type": "code", "execution_count": 14, "id": "d8130c73-6c5f-45b1-93ae-db7679c8ca56", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "name 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "extra_field 1.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "pricing_formulas.isna().sum()/pricing_formulas.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "9f2909c1-bc6a-443f-a077-84f6ce6b7ab5", "metadata": {}, "outputs": [], "source": [ "#variable retenu: [[\"id\",\"name\"]]" ] }, { "cell_type": "code", "execution_count": 15, "id": "44f1dbfd-c3cf-464b-9877-f37fcc61da92", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "name object\n", "created_at object\n", "updated_at object\n", "extra_field float64\n", "identifier object\n", "dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "pricing_formulas.dtypes" ] }, { "cell_type": "code", "execution_count": 16, "id": "6784b41b-da74-4fae-832e-16641ae710c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atextra_fieldidentifier
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, name, created_at, updated_at, extra_field, identifier]\n", "Index: []" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "pricing_formulas.loc[pricing_formulas[['id']].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "2145b0a4-b73d-4530-8c12-a78b1cf86eae", "metadata": {}, "source": [ "## IV. product_packs" ] }, { "cell_type": "code", "execution_count": 17, "id": "e36b07a7-4f0b-4711-86a0-12a1d8158eef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "name 1.0\n", "type_of 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "product_packs.isna().sum()/product_packs.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "e0887a01-51ea-4034-84fe-dc4dbf2ad949", "metadata": {}, "outputs": [], "source": [ "#variable retenu:[[\"id\",\"name\"]]" ] }, { "cell_type": "code", "execution_count": 18, "id": "8707396a-f86b-476d-a9f9-c39f8de1d02e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "name float64\n", "type_of int64\n", "created_at object\n", "updated_at object\n", "identifier object\n", "dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "product_packs.dtypes" ] }, { "cell_type": "code", "execution_count": 19, "id": "4b102bd3-924b-43da-8915-be7664c23f97", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnametype_ofcreated_atupdated_atidentifier
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, name, type_of, created_at, updated_at, identifier]\n", "Index: []" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "product_packs.loc[product_packs[['id']].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "cfe0c525-896b-4731-b38e-306ff6ea0c65", "metadata": {}, "source": [ "## V.products" ] }, { "cell_type": "code", "execution_count": 20, "id": "968beb24-f70c-4eb6-8b1e-4b04bc7fe9c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "amount 0.0\n", "is_full_price 0.0\n", "representation_id 0.0\n", "pricing_formula_id 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "category_id 0.0\n", "apply_price 0.0\n", "products_group_id 0.0\n", "product_pack_id 0.0\n", "extra_field 1.0\n", "amount_consumption 1.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "products.isna().sum()/products.shape[0]" ] }, { "cell_type": "code", "execution_count": 21, "id": "15bc6ac6-67e8-4e2c-9641-7ee8bb2581a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "amount float64\n", "is_full_price bool\n", "representation_id int64\n", "pricing_formula_id int64\n", "created_at object\n", "updated_at object\n", "category_id int64\n", "apply_price float64\n", "products_group_id int64\n", "product_pack_id int64\n", "extra_field float64\n", "amount_consumption float64\n", "identifier object\n", "dtype: object" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "products.dtypes" ] }, { "cell_type": "code", "execution_count": 22, "id": "7daa4f1a-e429-4daf-a2e1-1e311b487e09", "metadata": {}, "outputs": [], "source": [ "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { "cell_type": "code", "execution_count": 26, "id": "dc12b746-6708-4708-826a-acb5a8e665a1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", "

556 rows × 6 columns

\n", "
" ], "text/plain": [ " id name created_at \\\n", "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", ".. ... ... ... \n", "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", "\n", " updated_at extra_field \\\n", "0 2022-07-08 07:08:26.802266+02:00 NaN \n", "1 2022-02-18 15:57:55.792581+01:00 NaN \n", "2 2021-01-25 19:16:05.187114+01:00 NaN \n", "3 2023-03-13 11:30:29.525335+01:00 NaN \n", "4 2021-01-25 19:23:06.816402+01:00 NaN \n", ".. ... ... \n", "551 2022-02-18 15:57:55.792581+01:00 NaN \n", "552 2022-02-18 15:57:55.792581+01:00 NaN \n", "553 2022-02-18 15:55:03.576236+01:00 NaN \n", "554 2022-02-18 16:16:00.199543+01:00 NaN \n", "555 2022-02-18 16:16:00.199543+01:00 NaN \n", "\n", " identifier \n", "0 21d4b0043c12b21952b0797d140991a1 \n", "1 223b09e6c3f1f75dbf8df019af97a555 \n", "2 ba33b7b6d225a75d713a356b49c4d915 \n", "3 a00b61ad933518856f86e63ca91a5750 \n", "4 7f6013803c242253a5ccde80f780984f \n", ".. ... \n", "551 7d888e42abe101fc8b21dc88948c8b74 \n", "552 3cf21731c25eee650d5b232ee4780563 \n", "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", "554 4c53016fc65847646f600eff853593e5 \n", "555 90e642c0e1ef6bc9f2bc43089798de00 \n", "\n", "[556 rows x 6 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pricing_formulas" ] }, { "cell_type": "markdown", "id": "46aad10f-8530-410e-872b-bb253c553a46", "metadata": {}, "source": [ "# jointure entre les bases" ] }, { "cell_type": "code", "execution_count": null, "id": "a4c3edd1-6d58-4c57-b3e4-0ef3529f6b8c", "metadata": {}, "outputs": [], "source": [ "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { "cell_type": "code", "execution_count": 74, "id": "eac537e1-bbad-45bc-a85c-12b675da1088", "metadata": {}, "outputs": [], "source": [ "#Merge1 entre products et pricing_formulas\n", "base1=products.merge(pricing_formulas, how='left', left_on= 'pricing_formula_id', right_on= 'id', suffixes = (\"_products\", \"_pricing_formula\"))" ] }, { "cell_type": "code", "execution_count": 78, "id": "75be3a30-3114-432d-87d6-697533c3c871", "metadata": {}, "outputs": [], "source": [ "#Merge2 entre base1 et products_groups\n", "base2=base1.merge(products_groups, how='left', left_on= 'id_pricing_formula', right_on= 'id', suffixes = (\"_merge2\", \"_product_group\"))" ] }, { "cell_type": "code", "execution_count": 81, "id": "34a169c6-07a8-4ac3-a9e1-d7e7461f7310", "metadata": {}, "outputs": [], "source": [ "#Merge3 entre base2 et type_of_pricing_formulas\n", "base3=base2.merge(type_of_pricing_formulas, how='left', left_on= 'id_pricing_formula', right_on= 'pricing_formula_id', suffixes = (\"_merge3\", \"_type_of_pricing_f\"))" ] }, { "cell_type": "code", "execution_count": 89, "id": "f44f40d2-5304-4931-b7e6-fcc06b2657b6", "metadata": {}, "outputs": [], "source": [ "#Merge4 entre base3 et type_of_pricing_formulas\n", "df_product_pricing=base3.merge(product_packs, how='left', left_on= 'product_pack_id', right_on= 'id', suffixes = (\"_merge4\", \"_product_pack\"))" ] }, { "cell_type": "code", "execution_count": 90, "id": "a28772c3-7bc1-46b4-acc8-1388dc60ec98", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_productsamountis_full_pricerepresentation_id_merge2pricing_formula_id_merge2created_at_productsupdated_at_productscategory_id_merge2apply_priceproducts_group_id...pricing_formula_idcreated_at_type_of_pricing_fupdated_at_type_of_pricing_fidentifier_merge4idname_product_packtype_ofcreated_atupdated_atidentifier_product_pack
0106829.0False9141142020-09-03 14:09:43.119798+02:002020-09-03 14:09:43.119798+02:00410.010655...114.02021-02-15 17:02:27.395376+01:002021-02-15 17:02:27.395376+01:003706121eb9f43b635bef1433c06f679c1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
14789.5False2731312020-09-03 13:21:22.711773+02:002020-09-03 13:21:22.711773+02:0010.0471...131.02021-02-05 11:52:05.923905+01:002021-02-05 11:52:05.923905+01:000aceb248607671792298436004b952751NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
22087311.5False2751372020-09-03 14:46:33.589030+02:002020-09-03 14:46:33.589030+02:0010.020825...137.02021-02-05 11:52:05.939898+01:002021-02-05 11:52:05.939898+01:0093002d4637331edd81ffc28b6e8e89c01NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
31571428.0False8251992022-01-28 19:29:23.525722+01:002022-01-28 19:29:23.525722+01:0050.0156773...9.02021-02-05 11:52:06.107939+01:002021-02-05 11:52:06.107939+01:007d0b25bdfff9f366da8be820608c81911NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
413418.5False9932020-09-03 13:29:30.773089+02:002020-09-03 13:29:30.773089+02:0010.01175...93.02021-02-05 11:52:06.004162+01:002021-02-05 11:52:06.004162+01:001dbb0795e8f47cb75ba7cdb08c06be5f1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
\n", "

5 rows × 41 columns

\n", "
" ], "text/plain": [ " id_products amount is_full_price representation_id_merge2 \\\n", "0 10682 9.0 False 914 \n", "1 478 9.5 False 273 \n", "2 20873 11.5 False 275 \n", "3 157142 8.0 False 82519 \n", "4 1341 8.5 False 9 \n", "\n", " pricing_formula_id_merge2 created_at_products \\\n", "0 114 2020-09-03 14:09:43.119798+02:00 \n", "1 131 2020-09-03 13:21:22.711773+02:00 \n", "2 137 2020-09-03 14:46:33.589030+02:00 \n", "3 9 2022-01-28 19:29:23.525722+01:00 \n", "4 93 2020-09-03 13:29:30.773089+02:00 \n", "\n", " updated_at_products category_id_merge2 apply_price \\\n", "0 2020-09-03 14:09:43.119798+02:00 41 0.0 \n", "1 2020-09-03 13:21:22.711773+02:00 1 0.0 \n", "2 2020-09-03 14:46:33.589030+02:00 1 0.0 \n", "3 2022-01-28 19:29:23.525722+01:00 5 0.0 \n", "4 2020-09-03 13:29:30.773089+02:00 1 0.0 \n", "\n", " products_group_id ... pricing_formula_id \\\n", "0 10655 ... 114.0 \n", "1 471 ... 131.0 \n", "2 20825 ... 137.0 \n", "3 156773 ... 9.0 \n", "4 1175 ... 93.0 \n", "\n", " created_at_type_of_pricing_f updated_at_type_of_pricing_f \\\n", "0 2021-02-15 17:02:27.395376+01:00 2021-02-15 17:02:27.395376+01:00 \n", "1 2021-02-05 11:52:05.923905+01:00 2021-02-05 11:52:05.923905+01:00 \n", "2 2021-02-05 11:52:05.939898+01:00 2021-02-05 11:52:05.939898+01:00 \n", "3 2021-02-05 11:52:06.107939+01:00 2021-02-05 11:52:06.107939+01:00 \n", "4 2021-02-05 11:52:06.004162+01:00 2021-02-05 11:52:06.004162+01:00 \n", "\n", " identifier_merge4 id name_product_pack type_of \\\n", "0 3706121eb9f43b635bef1433c06f679c 1 NaN 0 \n", "1 0aceb248607671792298436004b95275 1 NaN 0 \n", "2 93002d4637331edd81ffc28b6e8e89c0 1 NaN 0 \n", "3 7d0b25bdfff9f366da8be820608c8191 1 NaN 0 \n", "4 1dbb0795e8f47cb75ba7cdb08c06be5f 1 NaN 0 \n", "\n", " created_at updated_at \\\n", "0 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "1 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "2 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "3 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "4 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "\n", " identifier_product_pack \n", "0 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "1 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "2 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "3 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "4 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "\n", "[5 rows x 41 columns]" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_product_pricing.head(5)" ] }, { "cell_type": "code", "execution_count": null, "id": "03442997-806f-4285-a139-3bad46bb4522", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "id": "d22a0d75-53c5-4b54-9060-c9e7c307fb13", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data\"\n", "directory_path = '2'" ] }, { "cell_type": "code", "execution_count": 6, "id": "7c229dad-6ebd-4f43-99f1-fb330dc29466", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/2/2campaign_stats.csv',\n", " 'bdc2324-data/2/2campaigns.csv',\n", " 'bdc2324-data/2/2categories.csv',\n", " 'bdc2324-data/2/2contribution_sites.csv',\n", " 'bdc2324-data/2/2contributions.csv',\n", " 'bdc2324-data/2/2countries.csv',\n", " 'bdc2324-data/2/2currencies.csv',\n", " 'bdc2324-data/2/2customer_target_mappings.csv',\n", " 'bdc2324-data/2/2customersplus.csv',\n", " 'bdc2324-data/2/2event_types.csv',\n", " 'bdc2324-data/2/2events.csv',\n", " 'bdc2324-data/2/2facilities.csv',\n", " 'bdc2324-data/2/2link_stats.csv',\n", " 'bdc2324-data/2/2pricing_formulas.csv',\n", " 'bdc2324-data/2/2product_packs.csv',\n", " 'bdc2324-data/2/2products.csv',\n", " 'bdc2324-data/2/2products_groups.csv',\n", " 'bdc2324-data/2/2purchases.csv',\n", " 'bdc2324-data/2/2representation_category_capacities.csv',\n", " 'bdc2324-data/2/2representations.csv',\n", " 'bdc2324-data/2/2seasons.csv',\n", " 'bdc2324-data/2/2structure_tag_mappings.csv',\n", " 'bdc2324-data/2/2suppliers.csv',\n", " 'bdc2324-data/2/2tags.csv',\n", " 'bdc2324-data/2/2target_types.csv',\n", " 'bdc2324-data/2/2targets.csv',\n", " 'bdc2324-data/2/2tickets.csv']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BUCKET = \"bdc2324-data/2\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 7, "id": "df3d3548-3d76-4f07-afa1-e240932bc1c7", "metadata": {}, "outputs": [], "source": [ "dic_base_ent2=['campaign_stats','campaigns','categories','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']" ] }, { "cell_type": "code", "execution_count": 8, "id": "90f8d5fc-43f3-4f36-b8cc-89a41785f032", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425/673681459.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "dic_base_ent2=['campaign_stats','campaigns','categories','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']\n", "for nom_base in dic_base_ent2:\n", " FILE_PATH_S3_fanta = 'bdc2324-data/2/2' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "3e39a584-e02b-41b2-831c-33b920e298e9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "27" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dic_base_ent2)" ] }, { "cell_type": "code", "execution_count": 10, "id": "2b6c6f65", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def calculer_proportion_valeurs_manquantes_et_exporter(databases, fichier_sortie='proportion_valeurs_manquantes.xlsx'):\n", " \"\"\"\n", " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données et exporter les résultats dans un fichier Excel.\n", "\n", " Paramètres:\n", " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", " - fichier_sortie (str): Le chemin du fichier Excel de sortie.\n", "\n", " Retourne:\n", " - Un fichier Excel où chaque onglet représente une base de données différente avec la proportion de valeurs manquantes pour chaque variable.\n", " \"\"\"\n", " with pd.ExcelWriter(fichier_sortie) as writer:\n", " for nom_db, df in databases.items():\n", " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", " proportion_manquantes = df.isnull().mean()\n", " # Convertir en DataFrame pour un meilleur affichage\n", " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", " # Écrire le DataFrame dans un onglet du fichier Excel\n", " resultats_df.to_excel(writer, sheet_name=nom_db)\n", "\n", " print(f\"Les résultats ont été exportés dans le fichier '{fichier_sortie}'.\")\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "06759646-9419-4841-b12f-bbfceb417f3a", "metadata": {}, "outputs": [], "source": [ "#fonction calcul la proportion de valeur manquante\n", "\n", "import pandas as pd\n", "\n", "def calculer_proportion_valeurs_manquantes(databases):\n", " \"\"\"\n", " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données.\n", "\n", " Paramètres:\n", " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", "\n", " Retourne:\n", " - Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont des DataFrames avec la proportion de valeurs manquantes pour chaque variable.\n", " \"\"\"\n", " resultats = {}\n", " for nom_db, df in databases.items():\n", " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", " proportion_manquantes = df.isnull().mean()\n", " # Convertir en DataFrame pour un meilleur affichage\n", " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", " # Ajouter le résultat au dictionnaire\n", " resultats[nom_db] = resultats_df\n", " return resultats" ] }, { "cell_type": "code", "execution_count": 12, "id": "0960daa8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Base de données: Base1\n", " ProportionValeursManquantes\n", "id 0.00%\n", "campaign_id 0.00%\n", "customer_id 0.00%\n", "opened_at 68.67%\n", "sent_at 0.00%\n", "delivered_at 1.61%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: Base2\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 0.00%\n", "service_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "process_id 100.00%\n", "report_url 100.00%\n", "category 0.00%\n", "to_be_synced 0.00%\n", "identifier 0.00%\n", "sent_at 0.00%\n", "\n" ] } ], "source": [ "# Exemple d'utilisation\n", "\n", "databases = {'Base1': campaign_stats, 'Base2': campaigns}\n", "\n", "resultats = calculer_proportion_valeurs_manquantes(databases)\n", "\n", "for nom_db, resultat in resultats.items():\n", " print(f\"Base de données: {nom_db}\")\n", " print(resultat)\n", " print()\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "77dc02bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Base de données: campaign_stats\n", " ProportionValeursManquantes\n", "id 0.00%\n", "campaign_id 0.00%\n", "customer_id 0.00%\n", "opened_at 68.67%\n", "sent_at 0.00%\n", "delivered_at 1.61%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: campaigns\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 0.00%\n", "service_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "process_id 100.00%\n", "report_url 100.00%\n", "category 0.00%\n", "to_be_synced 0.00%\n", "identifier 0.00%\n", "sent_at 0.00%\n", "\n", "Base de données: categories\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 100.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "extra_field 100.00%\n", "quota 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: contribution_sites\n", " ProportionValeursManquantes\n", "id 0.00%\n", "identifier 0.00%\n", "facility_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: contributions\n", " ProportionValeursManquantes\n", "id 0.00%\n", "sent_at 0.00%\n", "software 100.00%\n", "satisfaction 39.65%\n", "extra_field 100.00%\n", "customer_id 0.00%\n", "contribution_site_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: countries\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 1.63%\n", "code 0.41%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: currencies\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: customer_target_mappings\n", " ProportionValeursManquantes\n", "id 0.00%\n", "customer_id 0.00%\n", "target_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "name 100.00%\n", "extra_field 100.00%\n", "\n", "Base de données: customersplus\n", " ProportionValeursManquantes\n", "id 0.00%\n", "lastname 0.02%\n", "firstname 0.01%\n", "birthdate 96.75%\n", "email 1.05%\n", "street_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "civility 100.00%\n", "is_partner 0.00%\n", "extra 100.00%\n", "deleted_at 100.00%\n", "reference 100.00%\n", "gender 0.00%\n", "is_email_true 0.00%\n", "extra_field 100.00%\n", "identifier 0.00%\n", "opt_in 0.00%\n", "structure_id 97.57%\n", "note 97.84%\n", "profession 100.00%\n", "language 46.16%\n", "mcp_contact_id 100.00%\n", "need_reload 0.00%\n", "last_buying_date 12.58%\n", "max_price 12.58%\n", "ticket_sum 0.00%\n", "average_price 12.58%\n", "fidelity 0.00%\n", "average_purchase_delay 12.58%\n", "average_price_basket 12.58%\n", "average_ticket_basket 12.58%\n", "total_price 0.00%\n", "preferred_category 100.00%\n", "preferred_supplier 100.00%\n", "preferred_formula 100.00%\n", "purchase_count 0.00%\n", "first_buying_date 12.58%\n", "last_visiting_date 100.00%\n", "zipcode 98.80%\n", "country 97.64%\n", "age 96.75%\n", "tenant_id 0.00%\n", "\n", "Base de données: event_types\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "fidelity_delay 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: events\n", " ProportionValeursManquantes\n", "id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "season_id 0.00%\n", "facility_id 0.00%\n", "name 0.00%\n", "event_type_id 0.00%\n", "manual_added 0.00%\n", "is_display 0.00%\n", "event_type_key_id 0.00%\n", "facility_key_id 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: facilities\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 50.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "street_id 0.00%\n", "fixed_capacity 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: link_stats\n", " ProportionValeursManquantes\n", "id 0.00%\n", "clicked_at 0.00%\n", "link_id 0.00%\n", "customer_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: pricing_formulas\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "extra_field 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: product_packs\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 100.00%\n", "type_of 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: products\n", " ProportionValeursManquantes\n", "id 0.00%\n", "amount 0.00%\n", "is_full_price 0.00%\n", "representation_id 0.00%\n", "pricing_formula_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "category_id 0.00%\n", "apply_price 0.00%\n", "products_group_id 0.00%\n", "product_pack_id 0.00%\n", "extra_field 100.00%\n", "amount_consumption 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: products_groups\n", " ProportionValeursManquantes\n", "id 0.00%\n", "percent_price 0.00%\n", "max_price 0.00%\n", "min_price 0.00%\n", "category_id 0.00%\n", "pricing_formula_id 0.00%\n", "representation_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: purchases\n", " ProportionValeursManquantes\n", "id 0.00%\n", "purchase_date 0.00%\n", "customer_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "number 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: representation_category_capacities\n", " ProportionValeursManquantes\n", "id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "representation_id 0.00%\n", "category_id 0.00%\n", "expected_filling 100.00%\n", "max_filling 100.00%\n", "\n", "Base de données: representations\n", " ProportionValeursManquantes\n", "id 0.00%\n", "serial 100.00%\n", "event_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "start_date_time 0.00%\n", "open 0.00%\n", "satisfaction 100.00%\n", "end_date_time 0.00%\n", "name 100.00%\n", "is_display 0.00%\n", "representation_type_id 100.00%\n", "expected_filling 100.00%\n", "max_filling 100.00%\n", "extra_field 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: seasons\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "start_date_time 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: structure_tag_mappings\n", " ProportionValeursManquantes\n", "id 0.00%\n", "structure_id 0.00%\n", "tag_id 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: suppliers\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 20.00%\n", "manually_added 0.00%\n", "label 100.00%\n", "itr 100.00%\n", "updated_at 0.00%\n", "created_at 0.00%\n", "commission 100.00%\n", "identifier 0.00%\n", "\n", "Base de données: tags\n", " ProportionValeursManquantes\n", "id 0.00%\n", "name 50.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: target_types\n", " ProportionValeursManquantes\n", "id 0.00%\n", "is_import 25.00%\n", "name 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "identifier 0.00%\n", "\n", "Base de données: targets\n", " ProportionValeursManquantes\n", "id 0.00%\n", "target_type_id 0.00%\n", "name 5.26%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "\n", "Base de données: tickets\n", " ProportionValeursManquantes\n", "id 0.00%\n", "number 0.00%\n", "created_at 0.00%\n", "updated_at 0.00%\n", "purchase_id 0.00%\n", "product_id 0.00%\n", "is_from_subscription 0.00%\n", "type_of 0.00%\n", "supplier_id 0.00%\n", "barcode 100.00%\n", "identifier 0.00%\n", "\n" ] } ], "source": [ "# Exemple d'utilisation\n", "dict={'campaign_stats': campaign_stats,\n", " 'campaigns': campaigns,\n", " 'categories': categories,\n", " 'contribution_sites': contribution_sites,\n", " 'contributions': contributions,\n", " 'countries': countries,\n", " 'currencies': currencies,\n", " 'customer_target_mappings': customer_target_mappings,\n", " 'customersplus': customersplus,\n", " 'event_types': event_types,\n", " 'events': events,\n", " 'facilities': facilities,\n", " 'link_stats': link_stats,\n", " 'pricing_formulas': pricing_formulas,\n", " 'product_packs': product_packs,\n", " 'products': products,\n", " 'products_groups': products_groups,\n", " 'purchases': purchases,\n", " 'representation_category_capacities': representation_category_capacities,\n", " 'representations': representations,\n", " 'seasons': seasons,\n", " 'structure_tag_mappings': structure_tag_mappings,\n", " 'suppliers': suppliers,\n", " 'tags': tags,\n", " 'target_types': target_types,\n", " 'targets': targets,\n", " 'tickets': tickets}\n", "\n", "resultats = calculer_proportion_valeurs_manquantes(dict)\n", "\n", "for nom_db, resultat in resultats.items():\n", " print(f\"Base de données: {nom_db}\")\n", " print(resultat)\n", " print()" ] }, { "cell_type": "code", "execution_count": 14, "id": "60be9271", "metadata": {}, "outputs": [], "source": [ "#MEME CODE mais avec l'exportation de result a en format excel" ] }, { "cell_type": "code", "execution_count": null, "id": "955fe358", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def calculer_proportion_valeurs_manquantes_et_exporter(databases, fichier_sortie='proportion_valeurs_manquantes.xlsx'):\n", " \"\"\"\n", " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données et exporter les résultats dans un fichier Excel.\n", "\n", " Paramètres:\n", " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", " - fichier_sortie (str): Le chemin du fichier Excel de sortie.\n", "\n", " Retourne:\n", " - Un fichier Excel où chaque onglet représente une base de données différente avec la proportion de valeurs manquantes pour chaque variable.\n", " \"\"\"\n", " with pd.ExcelWriter(fichier_sortie) as writer:\n", " for nom_db, df in databases.items():\n", " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", " proportion_manquantes = df.isnull().mean()\n", " # Convertir en DataFrame pour un meilleur affichage\n", " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", " # Écrire le DataFrame dans un onglet du fichier Excel\n", " resultats_df.to_excel(writer, sheet_name=nom_db)\n", "\n", " print(f\"Les résultats ont été exportés dans le fichier '{fichier_sortie}'.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7897b689", "metadata": {}, "outputs": [], "source": [ "# Exemple d'utilisation\n", "dict={'campaign_stats': campaign_stats,\n", " 'campaigns': campaigns,\n", " 'categories': categories,\n", " 'contribution_sites': contribution_sites,\n", " 'contributions': contributions,\n", " 'countries': countries,\n", " 'currencies': currencies,\n", " 'customer_target_mappings': customer_target_mappings,\n", " 'customersplus': customersplus,\n", " 'event_types': event_types,\n", " 'events': events,\n", " 'facilities': facilities,\n", " 'link_stats': link_stats,\n", " 'pricing_formulas': pricing_formulas,\n", " 'product_packs': product_packs,\n", " 'products': products,\n", " 'products_groups': products_groups,\n", " 'purchases': purchases,\n", " 'representation_category_capacities': representation_category_capacities,\n", " 'representations': representations,\n", " 'seasons': seasons,\n", " 'structure_tag_mappings': structure_tag_mappings,\n", " 'suppliers': suppliers,\n", " 'tags': tags,\n", " 'target_types': target_types,\n", " 'targets': targets,\n", " 'tickets': tickets}\n", "\n", "calculer_proportion_valeurs_manquantes_et_exporter(dict, 'proportion_valeurs_manquantes_ent1.xlsx')\n" ] }, { "cell_type": "markdown", "id": "514273f4", "metadata": {}, "source": [ "## Entreprise 3" ] }, { "cell_type": "code", "execution_count": 29, "id": "69b8f59a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/3/3campaign_stats.csv',\n", " 'bdc2324-data/3/3campaigns.csv',\n", " 'bdc2324-data/3/3categories.csv',\n", " 'bdc2324-data/3/3consumptions.csv',\n", " 'bdc2324-data/3/3contribution_sites.csv',\n", " 'bdc2324-data/3/3contributions.csv',\n", " 'bdc2324-data/3/3countries.csv',\n", " 'bdc2324-data/3/3currencies.csv',\n", " 'bdc2324-data/3/3customer_target_mappings.csv',\n", " 'bdc2324-data/3/3customersplus.csv',\n", " 'bdc2324-data/3/3event_types.csv',\n", " 'bdc2324-data/3/3events.csv',\n", " 'bdc2324-data/3/3facilities.csv',\n", " 'bdc2324-data/3/3link_stats.csv',\n", " 'bdc2324-data/3/3pricing_formulas.csv',\n", " 'bdc2324-data/3/3product_packs.csv',\n", " 'bdc2324-data/3/3products.csv',\n", " 'bdc2324-data/3/3products_groups.csv',\n", " 'bdc2324-data/3/3purchases.csv',\n", " 'bdc2324-data/3/3representation_category_capacities.csv',\n", " 'bdc2324-data/3/3representations.csv',\n", " 'bdc2324-data/3/3seasons.csv',\n", " 'bdc2324-data/3/3structure_tag_mappings.csv',\n", " 'bdc2324-data/3/3suppliers.csv',\n", " 'bdc2324-data/3/3tags.csv',\n", " 'bdc2324-data/3/3target_types.csv',\n", " 'bdc2324-data/3/3targets.csv',\n", " 'bdc2324-data/3/3tickets.csv']" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BUCKET = \"bdc2324-data/3\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": null, "id": "8a842d0b-f341-4752-b624-3a339ef0fe1e", "metadata": {}, "outputs": [], "source": [ "# Chargement des données temporaires\n", "BUCKET = \"projet-bdc2324-team1\"\n", "FILE_KEY_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n", "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " tickets_kpi = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 17, "id": "9b4c005f", "metadata": {}, "outputs": [], "source": [ "dic_base_ent3=['campaign_stats','campaigns','categories','consumptions','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']" ] }, { "cell_type": "code", "execution_count": 20, "id": "aae542d6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_425/4241072101.py:5: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n", "/tmp/ipykernel_425/4241072101.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "dic_base_ent3=['campaign_stats','campaigns','categories','consumptions','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']\n", "for nom_base in dic_base_ent2:\n", " FILE_PATH_S3_fanta = 'bdc2324-data/3/3' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 21, "id": "907c650c-df7e-4e5c-b3cb-6595be061e99", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atfidelity_delayidentifier
060873journees des plantes2022-09-13 17:42:18.040557+02:002022-09-13 17:42:18.040557+02:003658568a64f69dd864539e7a682b03ef3a
160876parking2022-09-13 17:42:18.043821+02:002022-09-13 17:42:18.043821+02:00363ac156eead4ae6b40e9c498d532b4448
260997pass arc2022-09-13 18:04:38.812389+02:002022-09-13 18:04:38.812389+02:0036ddffce8b0a072d76a766097b34208482
361233paris pass museum2022-09-13 18:53:15.878739+02:002022-09-13 18:53:15.878739+02:0036394a376e43e498dccf8a448004f2aa84
460911spectacle noel2022-09-13 17:48:50.549760+02:002022-09-13 17:48:50.549760+02:0036a2b0f7c330d4c8d0338e6edf5ce4c81a
561007domaine + spect noel2022-09-13 18:07:25.121513+02:002022-09-13 18:07:25.121513+02:0036c65e9028b505dcfa71acef4e15f6e7be
662374patrivia2022-09-14 02:08:56.789118+02:002022-09-14 02:08:56.789118+02:00368db1f5c8774a8cf07542249278d7d778
772615NaN2023-08-14 06:20:26.491399+02:002023-08-14 06:20:26.491399+02:0036d41d8cd98f00b204e9800998ecf8427e
861011prestation annexe2022-09-13 18:07:25.126517+02:002022-09-13 18:07:25.126517+02:00362588ceebb05d3329f334687b8647887e
960877minibus2022-09-13 17:42:18.045141+02:002022-09-13 17:42:18.045141+02:0036397361c9c0dc82d911aa931223bd7a4e
1061708location espace2022-09-13 21:52:57.785694+02:002022-09-13 21:52:57.785694+02:00363738beebf604a960e016ffad6db1df74
1160931exposition2022-09-13 17:52:36.164774+02:002022-09-13 17:52:36.164774+02:003669033f19d53294c467d0fb7d3a4ad868
1260871domaine2022-09-13 17:42:18.037831+02:002022-09-13 17:42:18.037831+02:0036b81285860b791e63dee94559f0a9e8e4
1360878visite guidee2022-09-13 17:42:18.046593+02:002022-09-13 17:42:18.046593+02:00363474ed518d4fa7b86719680e70039ac2
1460875parc2022-09-13 17:42:18.042824+02:002022-09-13 17:42:18.042824+02:003641c16554f8160b3bd2a6b3809f309e27
1560870spectacle saison2022-09-13 17:42:18.028836+02:002022-09-13 17:42:18.028836+02:00367afdec36b66f94b67e813ac7d092ea0c
1660872domaine + spect saison2022-09-13 17:42:18.039515+02:002022-09-13 17:42:18.039515+02:0036ccd5dffaa070e9c7885ff3e9149f12f0
1760874supplement spectacle2022-09-13 17:42:18.041862+02:002022-09-13 17:42:18.041862+02:0036093d3e6b14b1ad33908a18522d02886b
1861203pique nique en blanc2022-09-13 18:44:16.813045+02:002022-09-13 18:44:16.813045+02:0036ff9a87979d4a564a1d2a1055a1aa186e
1961427restauration2022-09-13 19:26:41.906836+02:002022-09-13 19:26:41.906836+02:0036f1bcd494fa3171bf042e62c311157547
2060879animation culturelle2022-09-13 17:42:18.047567+02:002022-09-13 17:42:18.047567+02:003627b0b018dfa7301abffcb243a876e4c4
2161270ecurie2022-09-13 18:57:52.734356+02:002022-09-13 18:57:52.734356+02:0036a21e22a0d924104179b25069de927909
2260929domaine + spect ete2022-09-13 17:52:36.162726+02:002022-09-13 17:52:36.162726+02:00368bdc1d1d5fba2317af8c4d733d8206d7
2360910spectacle ete2022-09-13 17:48:50.548826+02:002022-09-13 17:48:50.548826+02:003644878d1fd6c7fe384274861294c59017
\n", "
" ], "text/plain": [ " id name created_at \\\n", "0 60873 journees des plantes 2022-09-13 17:42:18.040557+02:00 \n", "1 60876 parking 2022-09-13 17:42:18.043821+02:00 \n", "2 60997 pass arc 2022-09-13 18:04:38.812389+02:00 \n", "3 61233 paris pass museum 2022-09-13 18:53:15.878739+02:00 \n", "4 60911 spectacle noel 2022-09-13 17:48:50.549760+02:00 \n", "5 61007 domaine + spect noel 2022-09-13 18:07:25.121513+02:00 \n", "6 62374 patrivia 2022-09-14 02:08:56.789118+02:00 \n", "7 72615 NaN 2023-08-14 06:20:26.491399+02:00 \n", "8 61011 prestation annexe 2022-09-13 18:07:25.126517+02:00 \n", "9 60877 minibus 2022-09-13 17:42:18.045141+02:00 \n", "10 61708 location espace 2022-09-13 21:52:57.785694+02:00 \n", "11 60931 exposition 2022-09-13 17:52:36.164774+02:00 \n", "12 60871 domaine 2022-09-13 17:42:18.037831+02:00 \n", "13 60878 visite guidee 2022-09-13 17:42:18.046593+02:00 \n", "14 60875 parc 2022-09-13 17:42:18.042824+02:00 \n", "15 60870 spectacle saison 2022-09-13 17:42:18.028836+02:00 \n", "16 60872 domaine + spect saison 2022-09-13 17:42:18.039515+02:00 \n", "17 60874 supplement spectacle 2022-09-13 17:42:18.041862+02:00 \n", "18 61203 pique nique en blanc 2022-09-13 18:44:16.813045+02:00 \n", "19 61427 restauration 2022-09-13 19:26:41.906836+02:00 \n", "20 60879 animation culturelle 2022-09-13 17:42:18.047567+02:00 \n", "21 61270 ecurie 2022-09-13 18:57:52.734356+02:00 \n", "22 60929 domaine + spect ete 2022-09-13 17:52:36.162726+02:00 \n", "23 60910 spectacle ete 2022-09-13 17:48:50.548826+02:00 \n", "\n", " updated_at fidelity_delay \\\n", "0 2022-09-13 17:42:18.040557+02:00 36 \n", "1 2022-09-13 17:42:18.043821+02:00 36 \n", "2 2022-09-13 18:04:38.812389+02:00 36 \n", "3 2022-09-13 18:53:15.878739+02:00 36 \n", "4 2022-09-13 17:48:50.549760+02:00 36 \n", "5 2022-09-13 18:07:25.121513+02:00 36 \n", "6 2022-09-14 02:08:56.789118+02:00 36 \n", "7 2023-08-14 06:20:26.491399+02:00 36 \n", "8 2022-09-13 18:07:25.126517+02:00 36 \n", "9 2022-09-13 17:42:18.045141+02:00 36 \n", "10 2022-09-13 21:52:57.785694+02:00 36 \n", "11 2022-09-13 17:52:36.164774+02:00 36 \n", "12 2022-09-13 17:42:18.037831+02:00 36 \n", "13 2022-09-13 17:42:18.046593+02:00 36 \n", "14 2022-09-13 17:42:18.042824+02:00 36 \n", "15 2022-09-13 17:42:18.028836+02:00 36 \n", "16 2022-09-13 17:42:18.039515+02:00 36 \n", "17 2022-09-13 17:42:18.041862+02:00 36 \n", "18 2022-09-13 18:44:16.813045+02:00 36 \n", "19 2022-09-13 19:26:41.906836+02:00 36 \n", "20 2022-09-13 17:42:18.047567+02:00 36 \n", "21 2022-09-13 18:57:52.734356+02:00 36 \n", "22 2022-09-13 17:52:36.162726+02:00 36 \n", "23 2022-09-13 17:48:50.548826+02:00 36 \n", "\n", " identifier \n", "0 58568a64f69dd864539e7a682b03ef3a \n", "1 3ac156eead4ae6b40e9c498d532b4448 \n", "2 ddffce8b0a072d76a766097b34208482 \n", "3 394a376e43e498dccf8a448004f2aa84 \n", "4 a2b0f7c330d4c8d0338e6edf5ce4c81a \n", "5 c65e9028b505dcfa71acef4e15f6e7be \n", "6 8db1f5c8774a8cf07542249278d7d778 \n", "7 d41d8cd98f00b204e9800998ecf8427e \n", "8 2588ceebb05d3329f334687b8647887e \n", "9 397361c9c0dc82d911aa931223bd7a4e \n", "10 3738beebf604a960e016ffad6db1df74 \n", "11 69033f19d53294c467d0fb7d3a4ad868 \n", "12 b81285860b791e63dee94559f0a9e8e4 \n", "13 3474ed518d4fa7b86719680e70039ac2 \n", "14 41c16554f8160b3bd2a6b3809f309e27 \n", "15 7afdec36b66f94b67e813ac7d092ea0c \n", "16 ccd5dffaa070e9c7885ff3e9149f12f0 \n", "17 093d3e6b14b1ad33908a18522d02886b \n", "18 ff9a87979d4a564a1d2a1055a1aa186e \n", "19 f1bcd494fa3171bf042e62c311157547 \n", "20 27b0b018dfa7301abffcb243a876e4c4 \n", "21 a21e22a0d924104179b25069de927909 \n", "22 8bdc1d1d5fba2317af8c4d733d8206d7 \n", "23 44878d1fd6c7fe384274861294c59017 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "event_types" ] }, { "cell_type": "code", "execution_count": 27, "id": "617f6e54-c2a4-4627-9b94-9644a4dace71", "metadata": {}, "outputs": [], "source": [ "entreprise_base=['bdc2324-data/1', 'bdc2324-data/2', 'bdc2324-data/3', 'bdc2324-data/4', 'bdc2324-data/5', 'bdc2324-data/6', 'bdc2324-data/7', 'bdc2324-data/8','bdc2324-data/9','bdc2324-data/10','bdc2324-data/11','bdc2324-data/12','bdc2324-data/13','bdc2324-data/14','bdc2324-data/101']" ] }, { "cell_type": "code", "execution_count": null, "id": "b4ce5f9e-7358-4c09-871b-45989f142e25", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7a0b081c-e0ad-49a8-af25-78b2dc3cdcb8", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }