{ "cells": [ { "cell_type": "markdown", "id": "c4205b5d-e052-4863-a46b-20e4757052a7", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "ae3af8e6-ced8-4994-8877-fa98d4297cc0", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "dd3184e7-54a1-4463-af42-5850d9517a41", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 6, "id": "b6035982-9ff4-4013-9792-2d50e10db3d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1/1campaign_stats.csv',\n", " 'bdc2324-data/1/1campaigns.csv',\n", " 'bdc2324-data/1/1categories.csv',\n", " 'bdc2324-data/1/1countries.csv',\n", " 'bdc2324-data/1/1currencies.csv',\n", " 'bdc2324-data/1/1customer_target_mappings.csv',\n", " 'bdc2324-data/1/1customersplus.csv',\n", " 'bdc2324-data/1/1event_types.csv',\n", " 'bdc2324-data/1/1events.csv',\n", " 'bdc2324-data/1/1facilities.csv',\n", " 'bdc2324-data/1/1link_stats.csv',\n", " 'bdc2324-data/1/1pricing_formulas.csv',\n", " 'bdc2324-data/1/1product_packs.csv',\n", " 'bdc2324-data/1/1products.csv',\n", " 'bdc2324-data/1/1products_groups.csv',\n", " 'bdc2324-data/1/1purchases.csv',\n", " 'bdc2324-data/1/1representation_category_capacities.csv',\n", " 'bdc2324-data/1/1representations.csv',\n", " 'bdc2324-data/1/1seasons.csv',\n", " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", " 'bdc2324-data/1/1suppliers.csv',\n", " 'bdc2324-data/1/1tags.csv',\n", " 'bdc2324-data/1/1target_types.csv',\n", " 'bdc2324-data/1/1targets.csv',\n", " 'bdc2324-data/1/1tickets.csv',\n", " 'bdc2324-data/1/1type_of_categories.csv',\n", " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", " 'bdc2324-data/1/1type_ofs.csv']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import s3fs\n", "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "BUCKET = \"bdc2324-data/1\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 7, "id": "b86c935d-124f-453f-80dd-83ea6770d09c", "metadata": {}, "outputs": [], "source": [ "dic_base=['campaign_stats','campaigns','categories','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets','type_of_categories','type_of_pricing_formulas','type_ofs']" ] }, { "cell_type": "code", "execution_count": 8, "id": "f6d0b27c-0ecd-406b-b042-6c3802dd68fd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_438/1008972637.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "dic_base=['campaign_stats','campaigns','categories','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets','type_of_categories','type_of_pricing_formulas','type_ofs']\n", "for nom_base in dic_base:\n", " FILE_PATH_S3_fanta = 'bdc2324-data/1/1' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "2a6b5e22-3370-457f-83b7-dd1e13663229", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'bdc2324-data/1/1type_ofs.csv'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "FILE_PATH_S3_fanta" ] }, { "cell_type": "markdown", "id": "79012186-ea51-4252-843e-36a9bbe3847e", "metadata": {}, "source": [ "# Analyse exploratoire " ] }, { "cell_type": "markdown", "id": "1a365f29-4766-47d8-9796-24a5271867b2", "metadata": {}, "source": [ "## I. Base type_of_pricing_formulas" ] }, { "cell_type": "markdown", "id": "bcc14f93-2289-44eb-816b-a51049b258df", "metadata": {}, "source": [ "## Detection des valeur manquantes" ] }, { "cell_type": "raw", "id": "ab2ec4c4-9d38-4aeb-8202-9116df3cdd66", "metadata": {}, "source": [ "dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { "cell_type": "markdown", "id": "88759b4a-2633-478d-abce-29abeac376d1", "metadata": {}, "source": [ "def verifier_donnees_manquantes(base):\n", " donnees_manquantes = base.isna().sum()\n", " print(\"Données manquantes pour la base :\")\n", " print(donnees_manquantes)" ] }, { "cell_type": "markdown", "id": "df3075b4-1490-4cf2-a3fe-c6d4e2144ae3", "metadata": {}, "source": [ "for nom_base in dic_prod_princing:\n", " verifier_donnees_manquantes(nom_base)" ] }, { "cell_type": "code", "execution_count": 6, "id": "e0c67c01-e837-4772-b070-d1be0d895a36", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0\n", "type_of_id 0\n", "pricing_formula_id 0\n", "created_at 0\n", "updated_at 0\n", "identifier 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan d\n", "\n", "type_of_pricing_formulas.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "83a6a48d-effe-4537-b4bb-d5a540b610f1", "metadata": {}, "outputs": [], "source": [ "#variable retenu:[[\"id\",\"type_of_id\",\"pricing_formula_id\"]]" ] }, { "cell_type": "code", "execution_count": 7, "id": "3eaffaa6-1164-4ee9-a671-8b5eb3df797d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
0111272021-01-05 11:55:51.226960+01:002021-01-05 11:55:51.226960+01:00cf2918b25e6dcf8c30798ca05c8ec8ed
12124252021-01-05 11:55:51.235606+01:002021-01-05 11:55:51.235606+01:002c8ee3f7c1487d792b6c946314e681f2
23129372021-01-05 11:55:51.240114+01:002021-01-05 11:55:51.240114+01:0044e55c85e4eb59b3c3c01c137a6b25fc
341482021-01-05 11:55:51.244638+01:002021-01-05 11:55:51.244638+01:00ee3bb93b7e2217cd86a49d547fedf6c6
45172021-01-05 11:55:51.249409+01:002021-01-05 11:55:51.249409+01:00ae701668574f1a653d2b21ddfd250620
.....................
563564466562022-02-18 16:15:58.872249+01:002022-02-18 16:15:58.872249+01:00f669824cdca9de9697f07ff3ba365a8d
564565466072022-02-18 16:15:59.231018+01:002022-02-18 16:15:59.231018+01:006421c8146a598758139153b0e7b921ea
565566467002022-02-18 16:15:59.724812+01:002022-02-18 16:15:59.724812+01:006823f6d4d80b322fbfb8b83545a9f96d
566567481182022-02-18 16:16:00.163381+01:002022-02-18 16:16:00.163381+01:0035cfc12584b4d1b94795d97fd0aa56e8
5675697481572023-03-13 11:30:29.480161+01:002023-03-13 11:30:29.480161+01:0055863541f33fd229ac9b54d9ec1f4874
\n", "

568 rows × 6 columns

\n", "
" ], "text/plain": [ " id type_of_id pricing_formula_id created_at \\\n", "0 1 1 127 2021-01-05 11:55:51.226960+01:00 \n", "1 2 1 2425 2021-01-05 11:55:51.235606+01:00 \n", "2 3 1 2937 2021-01-05 11:55:51.240114+01:00 \n", "3 4 1 48 2021-01-05 11:55:51.244638+01:00 \n", "4 5 1 7 2021-01-05 11:55:51.249409+01:00 \n", ".. ... ... ... ... \n", "563 564 4 6656 2022-02-18 16:15:58.872249+01:00 \n", "564 565 4 6607 2022-02-18 16:15:59.231018+01:00 \n", "565 566 4 6700 2022-02-18 16:15:59.724812+01:00 \n", "566 567 4 8118 2022-02-18 16:16:00.163381+01:00 \n", "567 569 7 48157 2023-03-13 11:30:29.480161+01:00 \n", "\n", " updated_at identifier \n", "0 2021-01-05 11:55:51.226960+01:00 cf2918b25e6dcf8c30798ca05c8ec8ed \n", "1 2021-01-05 11:55:51.235606+01:00 2c8ee3f7c1487d792b6c946314e681f2 \n", "2 2021-01-05 11:55:51.240114+01:00 44e55c85e4eb59b3c3c01c137a6b25fc \n", "3 2021-01-05 11:55:51.244638+01:00 ee3bb93b7e2217cd86a49d547fedf6c6 \n", "4 2021-01-05 11:55:51.249409+01:00 ae701668574f1a653d2b21ddfd250620 \n", ".. ... ... \n", "563 2022-02-18 16:15:58.872249+01:00 f669824cdca9de9697f07ff3ba365a8d \n", "564 2022-02-18 16:15:59.231018+01:00 6421c8146a598758139153b0e7b921ea \n", "565 2022-02-18 16:15:59.724812+01:00 6823f6d4d80b322fbfb8b83545a9f96d \n", "566 2022-02-18 16:16:00.163381+01:00 35cfc12584b4d1b94795d97fd0aa56e8 \n", "567 2023-03-13 11:30:29.480161+01:00 55863541f33fd229ac9b54d9ec1f4874 \n", "\n", "[568 rows x 6 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type_of_pricing_formulas" ] }, { "cell_type": "code", "execution_count": 8, "id": "57298669-8d55-40d5-a5aa-4c5df984eec7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "type_of_id int64\n", "pricing_formula_id int64\n", "created_at object\n", "updated_at object\n", "identifier object\n", "dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "type_of_pricing_formulas.dtypes" ] }, { "cell_type": "code", "execution_count": 9, "id": "c11850cb-8833-44c0-a11d-9695d620a42b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, type_of_id, pricing_formula_id, created_at, updated_at, identifier]\n", "Index: []" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "type_of_pricing_formulas.loc[type_of_pricing_formulas['id'].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "7a40de03-5e18-4d3d-a0f8-da960c29fad8", "metadata": {}, "source": [ "## II.products_groups" ] }, { "cell_type": "code", "execution_count": 10, "id": "89909175-6734-4e8e-8632-d6f8ca812388", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0\n", "percent_price 0\n", "max_price 0\n", "min_price 0\n", "category_id 0\n", "pricing_formula_id 0\n", "representation_id 0\n", "created_at 0\n", "updated_at 0\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "products_groups.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "id": "e0518684-c83c-4f0a-89ea-d7dcfd60051d", "metadata": {}, "outputs": [], "source": [ "#variable retenu:[[\"id\",\"percent_price\",\"max_price\",\"min_price\",\"category_id\",\"pricing_formula_id\",\"representation_id\"]]" ] }, { "cell_type": "code", "execution_count": 11, "id": "6a187170-96c4-48d2-9568-b270f67e2c27", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "percent_price float64\n", "max_price float64\n", "min_price float64\n", "category_id int64\n", "pricing_formula_id int64\n", "representation_id int64\n", "created_at object\n", "updated_at object\n", "dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "products_groups.dtypes" ] }, { "cell_type": "code", "execution_count": 12, "id": "2fba2cb0-a6a4-43b2-a854-3be07939c28b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpercent_pricemax_pricemin_pricecategory_idpricing_formula_idrepresentation_idcreated_atupdated_at
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, percent_price, max_price, min_price, category_id, pricing_formula_id, representation_id, created_at, updated_at]\n", "Index: []" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "products_groups.loc[products_groups[['id','pricing_formula_id','representation_id']].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "5312ac13-8fbd-4c3f-a98a-8c28f079a599", "metadata": {}, "source": [ "## III.pricing_formulas" ] }, { "cell_type": "code", "execution_count": 13, "id": "3383a773-0817-4b23-84e7-8d5d0c74b179", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", "

556 rows × 6 columns

\n", "
" ], "text/plain": [ " id name created_at \\\n", "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", ".. ... ... ... \n", "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", "\n", " updated_at extra_field \\\n", "0 2022-07-08 07:08:26.802266+02:00 NaN \n", "1 2022-02-18 15:57:55.792581+01:00 NaN \n", "2 2021-01-25 19:16:05.187114+01:00 NaN \n", "3 2023-03-13 11:30:29.525335+01:00 NaN \n", "4 2021-01-25 19:23:06.816402+01:00 NaN \n", ".. ... ... \n", "551 2022-02-18 15:57:55.792581+01:00 NaN \n", "552 2022-02-18 15:57:55.792581+01:00 NaN \n", "553 2022-02-18 15:55:03.576236+01:00 NaN \n", "554 2022-02-18 16:16:00.199543+01:00 NaN \n", "555 2022-02-18 16:16:00.199543+01:00 NaN \n", "\n", " identifier \n", "0 21d4b0043c12b21952b0797d140991a1 \n", "1 223b09e6c3f1f75dbf8df019af97a555 \n", "2 ba33b7b6d225a75d713a356b49c4d915 \n", "3 a00b61ad933518856f86e63ca91a5750 \n", "4 7f6013803c242253a5ccde80f780984f \n", ".. ... \n", "551 7d888e42abe101fc8b21dc88948c8b74 \n", "552 3cf21731c25eee650d5b232ee4780563 \n", "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", "554 4c53016fc65847646f600eff853593e5 \n", "555 90e642c0e1ef6bc9f2bc43089798de00 \n", "\n", "[556 rows x 6 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pricing_formulas" ] }, { "cell_type": "code", "execution_count": 14, "id": "d8130c73-6c5f-45b1-93ae-db7679c8ca56", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "name 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "extra_field 1.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "pricing_formulas.isna().sum()/pricing_formulas.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "9f2909c1-bc6a-443f-a077-84f6ce6b7ab5", "metadata": {}, "outputs": [], "source": [ "#variable retenu: [[\"id\",\"name\"]]" ] }, { "cell_type": "code", "execution_count": 15, "id": "44f1dbfd-c3cf-464b-9877-f37fcc61da92", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "name object\n", "created_at object\n", "updated_at object\n", "extra_field float64\n", "identifier object\n", "dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "pricing_formulas.dtypes" ] }, { "cell_type": "code", "execution_count": 16, "id": "6784b41b-da74-4fae-832e-16641ae710c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atextra_fieldidentifier
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, name, created_at, updated_at, extra_field, identifier]\n", "Index: []" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "pricing_formulas.loc[pricing_formulas[['id']].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "2145b0a4-b73d-4530-8c12-a78b1cf86eae", "metadata": {}, "source": [ "## IV. product_packs" ] }, { "cell_type": "code", "execution_count": 17, "id": "e36b07a7-4f0b-4711-86a0-12a1d8158eef", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "name 1.0\n", "type_of 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "product_packs.isna().sum()/product_packs.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "e0887a01-51ea-4034-84fe-dc4dbf2ad949", "metadata": {}, "outputs": [], "source": [ "#variable retenu:[[\"id\",\"name\"]]" ] }, { "cell_type": "code", "execution_count": 18, "id": "8707396a-f86b-476d-a9f9-c39f8de1d02e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "name float64\n", "type_of int64\n", "created_at object\n", "updated_at object\n", "identifier object\n", "dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "product_packs.dtypes" ] }, { "cell_type": "code", "execution_count": 19, "id": "4b102bd3-924b-43da-8915-be7664c23f97", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnametype_ofcreated_atupdated_atidentifier
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, name, type_of, created_at, updated_at, identifier]\n", "Index: []" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Identification des doublons\n", "product_packs.loc[product_packs[['id']].duplicated(keep=False),:]" ] }, { "cell_type": "markdown", "id": "cfe0c525-896b-4731-b38e-306ff6ea0c65", "metadata": {}, "source": [ "## V.products" ] }, { "cell_type": "code", "execution_count": 20, "id": "968beb24-f70c-4eb6-8b1e-4b04bc7fe9c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id 0.0\n", "amount 0.0\n", "is_full_price 0.0\n", "representation_id 0.0\n", "pricing_formula_id 0.0\n", "created_at 0.0\n", "updated_at 0.0\n", "category_id 0.0\n", "apply_price 0.0\n", "products_group_id 0.0\n", "product_pack_id 0.0\n", "extra_field 1.0\n", "amount_consumption 1.0\n", "identifier 0.0\n", "dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#detection des Nan \n", "\n", "products.isna().sum()/products.shape[0]" ] }, { "cell_type": "code", "execution_count": 21, "id": "15bc6ac6-67e8-4e2c-9641-7ee8bb2581a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "id int64\n", "amount float64\n", "is_full_price bool\n", "representation_id int64\n", "pricing_formula_id int64\n", "created_at object\n", "updated_at object\n", "category_id int64\n", "apply_price float64\n", "products_group_id int64\n", "product_pack_id int64\n", "extra_field float64\n", "amount_consumption float64\n", "identifier object\n", "dtype: object" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#type des variables\n", "\n", "products.dtypes" ] }, { "cell_type": "code", "execution_count": 22, "id": "7daa4f1a-e429-4daf-a2e1-1e311b487e09", "metadata": {}, "outputs": [], "source": [ "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { "cell_type": "code", "execution_count": 26, "id": "dc12b746-6708-4708-826a-acb5a8e665a1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", "

556 rows × 6 columns

\n", "
" ], "text/plain": [ " id name created_at \\\n", "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", ".. ... ... ... \n", "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", "\n", " updated_at extra_field \\\n", "0 2022-07-08 07:08:26.802266+02:00 NaN \n", "1 2022-02-18 15:57:55.792581+01:00 NaN \n", "2 2021-01-25 19:16:05.187114+01:00 NaN \n", "3 2023-03-13 11:30:29.525335+01:00 NaN \n", "4 2021-01-25 19:23:06.816402+01:00 NaN \n", ".. ... ... \n", "551 2022-02-18 15:57:55.792581+01:00 NaN \n", "552 2022-02-18 15:57:55.792581+01:00 NaN \n", "553 2022-02-18 15:55:03.576236+01:00 NaN \n", "554 2022-02-18 16:16:00.199543+01:00 NaN \n", "555 2022-02-18 16:16:00.199543+01:00 NaN \n", "\n", " identifier \n", "0 21d4b0043c12b21952b0797d140991a1 \n", "1 223b09e6c3f1f75dbf8df019af97a555 \n", "2 ba33b7b6d225a75d713a356b49c4d915 \n", "3 a00b61ad933518856f86e63ca91a5750 \n", "4 7f6013803c242253a5ccde80f780984f \n", ".. ... \n", "551 7d888e42abe101fc8b21dc88948c8b74 \n", "552 3cf21731c25eee650d5b232ee4780563 \n", "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", "554 4c53016fc65847646f600eff853593e5 \n", "555 90e642c0e1ef6bc9f2bc43089798de00 \n", "\n", "[556 rows x 6 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pricing_formulas" ] }, { "cell_type": "markdown", "id": "46aad10f-8530-410e-872b-bb253c553a46", "metadata": {}, "source": [ "# jointure entre les bases" ] }, { "cell_type": "code", "execution_count": null, "id": "a4c3edd1-6d58-4c57-b3e4-0ef3529f6b8c", "metadata": {}, "outputs": [], "source": [ "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { "cell_type": "code", "execution_count": 74, "id": "eac537e1-bbad-45bc-a85c-12b675da1088", "metadata": {}, "outputs": [], "source": [ "#Merge1 entre products et pricing_formulas\n", "base1=products.merge(pricing_formulas, how='left', left_on= 'pricing_formula_id', right_on= 'id', suffixes = (\"_products\", \"_pricing_formula\"))" ] }, { "cell_type": "code", "execution_count": 78, "id": "75be3a30-3114-432d-87d6-697533c3c871", "metadata": {}, "outputs": [], "source": [ "#Merge2 entre base1 et products_groups\n", "base2=base1.merge(products_groups, how='left', left_on= 'id_pricing_formula', right_on= 'id', suffixes = (\"_merge2\", \"_product_group\"))" ] }, { "cell_type": "code", "execution_count": 81, "id": "34a169c6-07a8-4ac3-a9e1-d7e7461f7310", "metadata": {}, "outputs": [], "source": [ "#Merge3 entre base2 et type_of_pricing_formulas\n", "base3=base2.merge(type_of_pricing_formulas, how='left', left_on= 'id_pricing_formula', right_on= 'pricing_formula_id', suffixes = (\"_merge3\", \"_type_of_pricing_f\"))" ] }, { "cell_type": "code", "execution_count": 89, "id": "f44f40d2-5304-4931-b7e6-fcc06b2657b6", "metadata": {}, "outputs": [], "source": [ "#Merge4 entre base3 et type_of_pricing_formulas\n", "df_product_pricing=base3.merge(product_packs, how='left', left_on= 'product_pack_id', right_on= 'id', suffixes = (\"_merge4\", \"_product_pack\"))" ] }, { "cell_type": "code", "execution_count": 90, "id": "a28772c3-7bc1-46b4-acc8-1388dc60ec98", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_productsamountis_full_pricerepresentation_id_merge2pricing_formula_id_merge2created_at_productsupdated_at_productscategory_id_merge2apply_priceproducts_group_id...pricing_formula_idcreated_at_type_of_pricing_fupdated_at_type_of_pricing_fidentifier_merge4idname_product_packtype_ofcreated_atupdated_atidentifier_product_pack
0106829.0False9141142020-09-03 14:09:43.119798+02:002020-09-03 14:09:43.119798+02:00410.010655...114.02021-02-15 17:02:27.395376+01:002021-02-15 17:02:27.395376+01:003706121eb9f43b635bef1433c06f679c1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
14789.5False2731312020-09-03 13:21:22.711773+02:002020-09-03 13:21:22.711773+02:0010.0471...131.02021-02-05 11:52:05.923905+01:002021-02-05 11:52:05.923905+01:000aceb248607671792298436004b952751NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
22087311.5False2751372020-09-03 14:46:33.589030+02:002020-09-03 14:46:33.589030+02:0010.020825...137.02021-02-05 11:52:05.939898+01:002021-02-05 11:52:05.939898+01:0093002d4637331edd81ffc28b6e8e89c01NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
31571428.0False8251992022-01-28 19:29:23.525722+01:002022-01-28 19:29:23.525722+01:0050.0156773...9.02021-02-05 11:52:06.107939+01:002021-02-05 11:52:06.107939+01:007d0b25bdfff9f366da8be820608c81911NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
413418.5False9932020-09-03 13:29:30.773089+02:002020-09-03 13:29:30.773089+02:0010.01175...93.02021-02-05 11:52:06.004162+01:002021-02-05 11:52:06.004162+01:001dbb0795e8f47cb75ba7cdb08c06be5f1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
\n", "

5 rows × 41 columns

\n", "
" ], "text/plain": [ " id_products amount is_full_price representation_id_merge2 \\\n", "0 10682 9.0 False 914 \n", "1 478 9.5 False 273 \n", "2 20873 11.5 False 275 \n", "3 157142 8.0 False 82519 \n", "4 1341 8.5 False 9 \n", "\n", " pricing_formula_id_merge2 created_at_products \\\n", "0 114 2020-09-03 14:09:43.119798+02:00 \n", "1 131 2020-09-03 13:21:22.711773+02:00 \n", "2 137 2020-09-03 14:46:33.589030+02:00 \n", "3 9 2022-01-28 19:29:23.525722+01:00 \n", "4 93 2020-09-03 13:29:30.773089+02:00 \n", "\n", " updated_at_products category_id_merge2 apply_price \\\n", "0 2020-09-03 14:09:43.119798+02:00 41 0.0 \n", "1 2020-09-03 13:21:22.711773+02:00 1 0.0 \n", "2 2020-09-03 14:46:33.589030+02:00 1 0.0 \n", "3 2022-01-28 19:29:23.525722+01:00 5 0.0 \n", "4 2020-09-03 13:29:30.773089+02:00 1 0.0 \n", "\n", " products_group_id ... pricing_formula_id \\\n", "0 10655 ... 114.0 \n", "1 471 ... 131.0 \n", "2 20825 ... 137.0 \n", "3 156773 ... 9.0 \n", "4 1175 ... 93.0 \n", "\n", " created_at_type_of_pricing_f updated_at_type_of_pricing_f \\\n", "0 2021-02-15 17:02:27.395376+01:00 2021-02-15 17:02:27.395376+01:00 \n", "1 2021-02-05 11:52:05.923905+01:00 2021-02-05 11:52:05.923905+01:00 \n", "2 2021-02-05 11:52:05.939898+01:00 2021-02-05 11:52:05.939898+01:00 \n", "3 2021-02-05 11:52:06.107939+01:00 2021-02-05 11:52:06.107939+01:00 \n", "4 2021-02-05 11:52:06.004162+01:00 2021-02-05 11:52:06.004162+01:00 \n", "\n", " identifier_merge4 id name_product_pack type_of \\\n", "0 3706121eb9f43b635bef1433c06f679c 1 NaN 0 \n", "1 0aceb248607671792298436004b95275 1 NaN 0 \n", "2 93002d4637331edd81ffc28b6e8e89c0 1 NaN 0 \n", "3 7d0b25bdfff9f366da8be820608c8191 1 NaN 0 \n", "4 1dbb0795e8f47cb75ba7cdb08c06be5f 1 NaN 0 \n", "\n", " created_at updated_at \\\n", "0 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "1 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "2 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "3 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "4 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", "\n", " identifier_product_pack \n", "0 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "1 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "2 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "3 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "4 a764b4bf13a360c7ac2a35ec4ca96c95 \n", "\n", "[5 rows x 41 columns]" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_product_pricing.head(5)" ] }, { "cell_type": "code", "execution_count": null, "id": "03442997-806f-4285-a139-3bad46bb4522", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "id": "d22a0d75-53c5-4b54-9060-c9e7c307fb13", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data\"\n", "directory_path = '2'" ] }, { "cell_type": "code", "execution_count": 11, "id": "7c229dad-6ebd-4f43-99f1-fb330dc29466", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/2/2campaign_stats.csv',\n", " 'bdc2324-data/2/2campaigns.csv',\n", " 'bdc2324-data/2/2categories.csv',\n", " 'bdc2324-data/2/2contribution_sites.csv',\n", " 'bdc2324-data/2/2contributions.csv',\n", " 'bdc2324-data/2/2countries.csv',\n", " 'bdc2324-data/2/2currencies.csv',\n", " 'bdc2324-data/2/2customer_target_mappings.csv',\n", " 'bdc2324-data/2/2customersplus.csv',\n", " 'bdc2324-data/2/2event_types.csv',\n", " 'bdc2324-data/2/2events.csv',\n", " 'bdc2324-data/2/2facilities.csv',\n", " 'bdc2324-data/2/2link_stats.csv',\n", " 'bdc2324-data/2/2pricing_formulas.csv',\n", " 'bdc2324-data/2/2product_packs.csv',\n", " 'bdc2324-data/2/2products.csv',\n", " 'bdc2324-data/2/2products_groups.csv',\n", " 'bdc2324-data/2/2purchases.csv',\n", " 'bdc2324-data/2/2representation_category_capacities.csv',\n", " 'bdc2324-data/2/2representations.csv',\n", " 'bdc2324-data/2/2seasons.csv',\n", " 'bdc2324-data/2/2structure_tag_mappings.csv',\n", " 'bdc2324-data/2/2suppliers.csv',\n", " 'bdc2324-data/2/2tags.csv',\n", " 'bdc2324-data/2/2target_types.csv',\n", " 'bdc2324-data/2/2targets.csv',\n", " 'bdc2324-data/2/2tickets.csv']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BUCKET = \"bdc2324-data/2\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 12, "id": "df3d3548-3d76-4f07-afa1-e240932bc1c7", "metadata": {}, "outputs": [], "source": [ "dic_base_ent2=['campaign_stats','campaigns','categories','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']" ] }, { "cell_type": "code", "execution_count": 13, "id": "90f8d5fc-43f3-4f36-b8cc-89a41785f032", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_438/673681459.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "dic_base_ent2=['campaign_stats','campaigns','categories','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']\n", "for nom_base in dic_base_ent2:\n", " FILE_PATH_S3_fanta = 'bdc2324-data/2/2' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "3e39a584-e02b-41b2-831c-33b920e298e9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "27" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(dic_base_ent2)" ] }, { "cell_type": "code", "execution_count": null, "id": "2b6c6f65", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def calculer_proportion_valeurs_manquantes_et_exporter(databases, fichier_sortie='proportion_valeurs_manquantes.xlsx'):\n", " \"\"\"\n", " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données et exporter les résultats dans un fichier Excel.\n", "\n", " Paramètres:\n", " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", " - fichier_sortie (str): Le chemin du fichier Excel de sortie.\n", "\n", " Retourne:\n", " - Un fichier Excel où chaque onglet représente une base de données différente avec la proportion de valeurs manquantes pour chaque variable.\n", " \"\"\"\n", " with pd.ExcelWriter(fichier_sortie) as writer:\n", " for nom_db, df in databases.items():\n", " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", " proportion_manquantes = df.isnull().mean()\n", " # Convertir en DataFrame pour un meilleur affichage\n", " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", " # Écrire le DataFrame dans un onglet du fichier Excel\n", " resultats_df.to_excel(writer, sheet_name=nom_db)\n", "\n", " print(f\"Les résultats ont été exportés dans le fichier '{fichier_sortie}'.\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "06759646-9419-4841-b12f-bbfceb417f3a", "metadata": {}, "outputs": [], "source": [ "#fonction calcul la proportion de valeur manquante\n", "\n", "import pandas as pd\n", "\n", "def calculer_proportion_valeurs_manquantes(databases):\n", " \"\"\"\n", " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données.\n", "\n", " Paramètres:\n", " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", "\n", " Retourne:\n", " - Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont des DataFrames avec la proportion de valeurs manquantes pour chaque variable.\n", " \"\"\"\n", " resultats = {}\n", " for nom_db, df in databases.items():\n", " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", " proportion_manquantes = df.isnull().mean()\n", " # Convertir en DataFrame pour un meilleur affichage\n", " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", " # Ajouter le résultat au dictionnaire\n", " resultats[nom_db] = resultats_df\n", " return resultats" ] }, { "cell_type": "code", "execution_count": null, "id": "0960daa8", "metadata": {}, "outputs": [], "source": [ "# Exemple d'utilisation\n", "\n", "databases = {'Base1': campaign_stats, 'Base2': campaigns}\n", "\n", "resultats = calculer_proportion_valeurs_manquantes(databases)\n", "\n", "for nom_db, resultat in resultats.items():\n", " print(f\"Base de données: {nom_db}\")\n", " print(resultat)\n", " print()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "77dc02bb", "metadata": {}, "outputs": [], "source": [ "# Exemple d'utilisation\n", "dict={'campaign_stats': campaign_stats,\n", " 'campaigns': campaigns,\n", " 'categories': categories,\n", " 'contribution_sites': contribution_sites,\n", " 'contributions': contributions,\n", " 'countries': countries,\n", " 'currencies': currencies,\n", " 'customer_target_mappings': customer_target_mappings,\n", " 'customersplus': customersplus,\n", " 'event_types': event_types,\n", " 'events': events,\n", " 'facilities': facilities,\n", " 'link_stats': link_stats,\n", " 'pricing_formulas': pricing_formulas,\n", " 'product_packs': product_packs,\n", " 'products': products,\n", " 'products_groups': products_groups,\n", " 'purchases': purchases,\n", " 'representation_category_capacities': representation_category_capacities,\n", " 'representations': representations,\n", " 'seasons': seasons,\n", " 'structure_tag_mappings': structure_tag_mappings,\n", " 'suppliers': suppliers,\n", " 'tags': tags,\n", " 'target_types': target_types,\n", " 'targets': targets,\n", " 'tickets': tickets}\n", "\n", "resultats = calculer_proportion_valeurs_manquantes(dict)\n", "\n", "for nom_db, resultat in resultats.items():\n", " print(f\"Base de données: {nom_db}\")\n", " print(resultat)\n", " print()" ] }, { "cell_type": "code", "execution_count": 1, "id": "60be9271", "metadata": {}, "outputs": [], "source": [ "#MEME CODE mais avec l'exportation de result a en format excel" ] }, { "cell_type": "code", "execution_count": null, "id": "955fe358", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def calculer_proportion_valeurs_manquantes_et_exporter(databases, fichier_sortie='proportion_valeurs_manquantes.xlsx'):\n", " \"\"\"\n", " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données et exporter les résultats dans un fichier Excel.\n", "\n", " Paramètres:\n", " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", " - fichier_sortie (str): Le chemin du fichier Excel de sortie.\n", "\n", " Retourne:\n", " - Un fichier Excel où chaque onglet représente une base de données différente avec la proportion de valeurs manquantes pour chaque variable.\n", " \"\"\"\n", " with pd.ExcelWriter(fichier_sortie) as writer:\n", " for nom_db, df in databases.items():\n", " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", " proportion_manquantes = df.isnull().mean()\n", " # Convertir en DataFrame pour un meilleur affichage\n", " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", " # Écrire le DataFrame dans un onglet du fichier Excel\n", " resultats_df.to_excel(writer, sheet_name=nom_db)\n", "\n", " print(f\"Les résultats ont été exportés dans le fichier '{fichier_sortie}'.\")" ] }, { "cell_type": "code", "execution_count": null, "id": "7897b689", "metadata": {}, "outputs": [], "source": [ "# Exemple d'utilisation\n", "dict={'campaign_stats': campaign_stats,\n", " 'campaigns': campaigns,\n", " 'categories': categories,\n", " 'contribution_sites': contribution_sites,\n", " 'contributions': contributions,\n", " 'countries': countries,\n", " 'currencies': currencies,\n", " 'customer_target_mappings': customer_target_mappings,\n", " 'customersplus': customersplus,\n", " 'event_types': event_types,\n", " 'events': events,\n", " 'facilities': facilities,\n", " 'link_stats': link_stats,\n", " 'pricing_formulas': pricing_formulas,\n", " 'product_packs': product_packs,\n", " 'products': products,\n", " 'products_groups': products_groups,\n", " 'purchases': purchases,\n", " 'representation_category_capacities': representation_category_capacities,\n", " 'representations': representations,\n", " 'seasons': seasons,\n", " 'structure_tag_mappings': structure_tag_mappings,\n", " 'suppliers': suppliers,\n", " 'tags': tags,\n", " 'target_types': target_types,\n", " 'targets': targets,\n", " 'tickets': tickets}\n", "\n", "calculer_proportion_valeurs_manquantes_et_exporter(dict, 'proportion_valeurs_manquantes_ent1.xlsx')\n" ] }, { "cell_type": "markdown", "id": "514273f4", "metadata": {}, "source": [ "## Entreprise 3" ] }, { "cell_type": "code", "execution_count": null, "id": "69b8f59a", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data/3\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": null, "id": "9b4c005f", "metadata": {}, "outputs": [], "source": [ "dic_base_ent3=['campaign_stats','campaigns','categories','consumptions','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']" ] }, { "cell_type": "code", "execution_count": null, "id": "aae542d6", "metadata": {}, "outputs": [], "source": [ "dic_base_ent3=['campaign_stats','campaigns','categories','consumptions','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']\n", "for nom_base in dic_base_ent2:\n", " FILE_PATH_S3_fanta = 'bdc2324-data/3/3' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": null, "id": "d4b2df76", "metadata": {}, "outputs": [], "source": [ "dict4={'campaign_stats': campaign_stats,\n", " 'campaigns': campaigns,\n", " 'categories': categories,\n", " 'consumptions': consumptions,\n", " 'contribution_sites': contribution_sites,\n", " 'contributions': contributions,\n", " 'countries': countries,\n", " 'currencies': currencies,\n", " 'customer_target_mappings': customer_target_mappings,\n", " 'customersplus': customersplus,\n", " 'event_types': event_types,\n", " 'events': events,\n", " 'facilities': facilities,\n", " 'link_stats': link_stats,\n", " 'pricing_formulas': pricing_formulas,\n", " 'product_packs': product_packs,\n", " 'products': products,\n", " 'products_groups': products_groups,\n", " 'purchases': purchases,\n", " 'representation_category_capacities': representation_category_capacities,\n", " 'representations': representations,\n", " 'seasons': seasons,\n", " 'structure_tag_mappings': structure_tag_mappings,\n", " 'suppliers': suppliers,\n", " 'tags': tags,\n", " 'target_types': target_types,\n", " 'targets': targets,\n", " 'tickets': tickets}" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }