diff --git a/code_valeur manquante.ipynb b/code_valeur manquante.ipynb new file mode 100644 index 0000000..d37e139 --- /dev/null +++ b/code_valeur manquante.ipynb @@ -0,0 +1,2116 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c4205b5d-e052-4863-a46b-20e4757052a7", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ae3af8e6-ced8-4994-8877-fa98d4297cc0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "dd3184e7-54a1-4463-af42-5850d9517a41", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b6035982-9ff4-4013-9792-2d50e10db3d1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1/1campaign_stats.csv',\n", + " 'bdc2324-data/1/1campaigns.csv',\n", + " 'bdc2324-data/1/1categories.csv',\n", + " 'bdc2324-data/1/1countries.csv',\n", + " 'bdc2324-data/1/1currencies.csv',\n", + " 'bdc2324-data/1/1customer_target_mappings.csv',\n", + " 'bdc2324-data/1/1customersplus.csv',\n", + " 'bdc2324-data/1/1event_types.csv',\n", + " 'bdc2324-data/1/1events.csv',\n", + " 'bdc2324-data/1/1facilities.csv',\n", + " 'bdc2324-data/1/1link_stats.csv',\n", + " 'bdc2324-data/1/1pricing_formulas.csv',\n", + " 'bdc2324-data/1/1product_packs.csv',\n", + " 'bdc2324-data/1/1products.csv',\n", + " 'bdc2324-data/1/1products_groups.csv',\n", + " 'bdc2324-data/1/1purchases.csv',\n", + " 'bdc2324-data/1/1representation_category_capacities.csv',\n", + " 'bdc2324-data/1/1representations.csv',\n", + " 'bdc2324-data/1/1seasons.csv',\n", + " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", + " 'bdc2324-data/1/1suppliers.csv',\n", + " 'bdc2324-data/1/1tags.csv',\n", + " 'bdc2324-data/1/1target_types.csv',\n", + " 'bdc2324-data/1/1targets.csv',\n", + " 'bdc2324-data/1/1tickets.csv',\n", + " 'bdc2324-data/1/1type_of_categories.csv',\n", + " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", + " 'bdc2324-data/1/1type_ofs.csv']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import s3fs\n", + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", + "\n", + "BUCKET = \"bdc2324-data/1\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b86c935d-124f-453f-80dd-83ea6770d09c", + "metadata": {}, + "outputs": [], + "source": [ + "dic_base=['campaign_stats','campaigns','categories','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets','type_of_categories','type_of_pricing_formulas','type_ofs']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f6d0b27c-0ecd-406b-b042-6c3802dd68fd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_438/1008972637.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" + ] + } + ], + "source": [ + "dic_base=['campaign_stats','campaigns','categories','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets','type_of_categories','type_of_pricing_formulas','type_ofs']\n", + "for nom_base in dic_base:\n", + " FILE_PATH_S3_fanta = 'bdc2324-data/1/1' + nom_base + '.csv'\n", + " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2a6b5e22-3370-457f-83b7-dd1e13663229", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bdc2324-data/1/1type_ofs.csv'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FILE_PATH_S3_fanta" + ] + }, + { + "cell_type": "markdown", + "id": "79012186-ea51-4252-843e-36a9bbe3847e", + "metadata": {}, + "source": [ + "# Analyse exploratoire " + ] + }, + { + "cell_type": "markdown", + "id": "1a365f29-4766-47d8-9796-24a5271867b2", + "metadata": {}, + "source": [ + "## I. Base type_of_pricing_formulas" + ] + }, + { + "cell_type": "markdown", + "id": "bcc14f93-2289-44eb-816b-a51049b258df", + "metadata": {}, + "source": [ + "## Detection des valeur manquantes" + ] + }, + { + "cell_type": "raw", + "id": "ab2ec4c4-9d38-4aeb-8202-9116df3cdd66", + "metadata": {}, + "source": [ + "dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" + ] + }, + { + "cell_type": "markdown", + "id": "88759b4a-2633-478d-abce-29abeac376d1", + "metadata": {}, + "source": [ + "def verifier_donnees_manquantes(base):\n", + " donnees_manquantes = base.isna().sum()\n", + " print(\"Données manquantes pour la base :\")\n", + " print(donnees_manquantes)" + ] + }, + { + "cell_type": "markdown", + "id": "df3075b4-1490-4cf2-a3fe-c6d4e2144ae3", + "metadata": {}, + "source": [ + "for nom_base in dic_prod_princing:\n", + " verifier_donnees_manquantes(nom_base)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e0c67c01-e837-4772-b070-d1be0d895a36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "type_of_id 0\n", + "pricing_formula_id 0\n", + "created_at 0\n", + "updated_at 0\n", + "identifier 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan d\n", + "\n", + "type_of_pricing_formulas.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83a6a48d-effe-4537-b4bb-d5a540b610f1", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu:[[\"id\",\"type_of_id\",\"pricing_formula_id\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3eaffaa6-1164-4ee9-a671-8b5eb3df797d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
0111272021-01-05 11:55:51.226960+01:002021-01-05 11:55:51.226960+01:00cf2918b25e6dcf8c30798ca05c8ec8ed
12124252021-01-05 11:55:51.235606+01:002021-01-05 11:55:51.235606+01:002c8ee3f7c1487d792b6c946314e681f2
23129372021-01-05 11:55:51.240114+01:002021-01-05 11:55:51.240114+01:0044e55c85e4eb59b3c3c01c137a6b25fc
341482021-01-05 11:55:51.244638+01:002021-01-05 11:55:51.244638+01:00ee3bb93b7e2217cd86a49d547fedf6c6
45172021-01-05 11:55:51.249409+01:002021-01-05 11:55:51.249409+01:00ae701668574f1a653d2b21ddfd250620
.....................
563564466562022-02-18 16:15:58.872249+01:002022-02-18 16:15:58.872249+01:00f669824cdca9de9697f07ff3ba365a8d
564565466072022-02-18 16:15:59.231018+01:002022-02-18 16:15:59.231018+01:006421c8146a598758139153b0e7b921ea
565566467002022-02-18 16:15:59.724812+01:002022-02-18 16:15:59.724812+01:006823f6d4d80b322fbfb8b83545a9f96d
566567481182022-02-18 16:16:00.163381+01:002022-02-18 16:16:00.163381+01:0035cfc12584b4d1b94795d97fd0aa56e8
5675697481572023-03-13 11:30:29.480161+01:002023-03-13 11:30:29.480161+01:0055863541f33fd229ac9b54d9ec1f4874
\n", + "

568 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id type_of_id pricing_formula_id created_at \\\n", + "0 1 1 127 2021-01-05 11:55:51.226960+01:00 \n", + "1 2 1 2425 2021-01-05 11:55:51.235606+01:00 \n", + "2 3 1 2937 2021-01-05 11:55:51.240114+01:00 \n", + "3 4 1 48 2021-01-05 11:55:51.244638+01:00 \n", + "4 5 1 7 2021-01-05 11:55:51.249409+01:00 \n", + ".. ... ... ... ... \n", + "563 564 4 6656 2022-02-18 16:15:58.872249+01:00 \n", + "564 565 4 6607 2022-02-18 16:15:59.231018+01:00 \n", + "565 566 4 6700 2022-02-18 16:15:59.724812+01:00 \n", + "566 567 4 8118 2022-02-18 16:16:00.163381+01:00 \n", + "567 569 7 48157 2023-03-13 11:30:29.480161+01:00 \n", + "\n", + " updated_at identifier \n", + "0 2021-01-05 11:55:51.226960+01:00 cf2918b25e6dcf8c30798ca05c8ec8ed \n", + "1 2021-01-05 11:55:51.235606+01:00 2c8ee3f7c1487d792b6c946314e681f2 \n", + "2 2021-01-05 11:55:51.240114+01:00 44e55c85e4eb59b3c3c01c137a6b25fc \n", + "3 2021-01-05 11:55:51.244638+01:00 ee3bb93b7e2217cd86a49d547fedf6c6 \n", + "4 2021-01-05 11:55:51.249409+01:00 ae701668574f1a653d2b21ddfd250620 \n", + ".. ... ... \n", + "563 2022-02-18 16:15:58.872249+01:00 f669824cdca9de9697f07ff3ba365a8d \n", + "564 2022-02-18 16:15:59.231018+01:00 6421c8146a598758139153b0e7b921ea \n", + "565 2022-02-18 16:15:59.724812+01:00 6823f6d4d80b322fbfb8b83545a9f96d \n", + "566 2022-02-18 16:16:00.163381+01:00 35cfc12584b4d1b94795d97fd0aa56e8 \n", + "567 2023-03-13 11:30:29.480161+01:00 55863541f33fd229ac9b54d9ec1f4874 \n", + "\n", + "[568 rows x 6 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_of_pricing_formulas" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "57298669-8d55-40d5-a5aa-4c5df984eec7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "type_of_id int64\n", + "pricing_formula_id int64\n", + "created_at object\n", + "updated_at object\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "type_of_pricing_formulas.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c11850cb-8833-44c0-a11d-9695d620a42b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, type_of_id, pricing_formula_id, created_at, updated_at, identifier]\n", + "Index: []" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "type_of_pricing_formulas.loc[type_of_pricing_formulas['id'].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "7a40de03-5e18-4d3d-a0f8-da960c29fad8", + "metadata": {}, + "source": [ + "## II.products_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "89909175-6734-4e8e-8632-d6f8ca812388", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "percent_price 0\n", + "max_price 0\n", + "min_price 0\n", + "category_id 0\n", + "pricing_formula_id 0\n", + "representation_id 0\n", + "created_at 0\n", + "updated_at 0\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "products_groups.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0518684-c83c-4f0a-89ea-d7dcfd60051d", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu:[[\"id\",\"percent_price\",\"max_price\",\"min_price\",\"category_id\",\"pricing_formula_id\",\"representation_id\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6a187170-96c4-48d2-9568-b270f67e2c27", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "percent_price float64\n", + "max_price float64\n", + "min_price float64\n", + "category_id int64\n", + "pricing_formula_id int64\n", + "representation_id int64\n", + "created_at object\n", + "updated_at object\n", + "dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "products_groups.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2fba2cb0-a6a4-43b2-a854-3be07939c28b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpercent_pricemax_pricemin_pricecategory_idpricing_formula_idrepresentation_idcreated_atupdated_at
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, percent_price, max_price, min_price, category_id, pricing_formula_id, representation_id, created_at, updated_at]\n", + "Index: []" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "products_groups.loc[products_groups[['id','pricing_formula_id','representation_id']].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "5312ac13-8fbd-4c3f-a98a-8c28f079a599", + "metadata": {}, + "source": [ + "## III.pricing_formulas" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3383a773-0817-4b23-84e7-8d5d0c74b179", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", + "

556 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id name created_at \\\n", + "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", + "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", + "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", + "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", + "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", + ".. ... ... ... \n", + "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", + "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", + "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", + "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", + "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", + "\n", + " updated_at extra_field \\\n", + "0 2022-07-08 07:08:26.802266+02:00 NaN \n", + "1 2022-02-18 15:57:55.792581+01:00 NaN \n", + "2 2021-01-25 19:16:05.187114+01:00 NaN \n", + "3 2023-03-13 11:30:29.525335+01:00 NaN \n", + "4 2021-01-25 19:23:06.816402+01:00 NaN \n", + ".. ... ... \n", + "551 2022-02-18 15:57:55.792581+01:00 NaN \n", + "552 2022-02-18 15:57:55.792581+01:00 NaN \n", + "553 2022-02-18 15:55:03.576236+01:00 NaN \n", + "554 2022-02-18 16:16:00.199543+01:00 NaN \n", + "555 2022-02-18 16:16:00.199543+01:00 NaN \n", + "\n", + " identifier \n", + "0 21d4b0043c12b21952b0797d140991a1 \n", + "1 223b09e6c3f1f75dbf8df019af97a555 \n", + "2 ba33b7b6d225a75d713a356b49c4d915 \n", + "3 a00b61ad933518856f86e63ca91a5750 \n", + "4 7f6013803c242253a5ccde80f780984f \n", + ".. ... \n", + "551 7d888e42abe101fc8b21dc88948c8b74 \n", + "552 3cf21731c25eee650d5b232ee4780563 \n", + "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", + "554 4c53016fc65847646f600eff853593e5 \n", + "555 90e642c0e1ef6bc9f2bc43089798de00 \n", + "\n", + "[556 rows x 6 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pricing_formulas" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d8130c73-6c5f-45b1-93ae-db7679c8ca56", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "name 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "extra_field 1.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "pricing_formulas.isna().sum()/pricing_formulas.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f2909c1-bc6a-443f-a077-84f6ce6b7ab5", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu: [[\"id\",\"name\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "44f1dbfd-c3cf-464b-9877-f37fcc61da92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "name object\n", + "created_at object\n", + "updated_at object\n", + "extra_field float64\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "pricing_formulas.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6784b41b-da74-4fae-832e-16641ae710c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldidentifier
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, name, created_at, updated_at, extra_field, identifier]\n", + "Index: []" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "pricing_formulas.loc[pricing_formulas[['id']].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "2145b0a4-b73d-4530-8c12-a78b1cf86eae", + "metadata": {}, + "source": [ + "## IV. product_packs" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e36b07a7-4f0b-4711-86a0-12a1d8158eef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "name 1.0\n", + "type_of 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "product_packs.isna().sum()/product_packs.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0887a01-51ea-4034-84fe-dc4dbf2ad949", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu:[[\"id\",\"name\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8707396a-f86b-476d-a9f9-c39f8de1d02e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "name float64\n", + "type_of int64\n", + "created_at object\n", + "updated_at object\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "product_packs.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4b102bd3-924b-43da-8915-be7664c23f97", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnametype_ofcreated_atupdated_atidentifier
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, name, type_of, created_at, updated_at, identifier]\n", + "Index: []" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "product_packs.loc[product_packs[['id']].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "cfe0c525-896b-4731-b38e-306ff6ea0c65", + "metadata": {}, + "source": [ + "## V.products" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "968beb24-f70c-4eb6-8b1e-4b04bc7fe9c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "amount 0.0\n", + "is_full_price 0.0\n", + "representation_id 0.0\n", + "pricing_formula_id 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "category_id 0.0\n", + "apply_price 0.0\n", + "products_group_id 0.0\n", + "product_pack_id 0.0\n", + "extra_field 1.0\n", + "amount_consumption 1.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "products.isna().sum()/products.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "15bc6ac6-67e8-4e2c-9641-7ee8bb2581a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "amount float64\n", + "is_full_price bool\n", + "representation_id int64\n", + "pricing_formula_id int64\n", + "created_at object\n", + "updated_at object\n", + "category_id int64\n", + "apply_price float64\n", + "products_group_id int64\n", + "product_pack_id int64\n", + "extra_field float64\n", + "amount_consumption float64\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "products.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7daa4f1a-e429-4daf-a2e1-1e311b487e09", + "metadata": {}, + "outputs": [], + "source": [ + "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "dc12b746-6708-4708-826a-acb5a8e665a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", + "

556 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id name created_at \\\n", + "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", + "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", + "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", + "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", + "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", + ".. ... ... ... \n", + "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", + "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", + "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", + "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", + "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", + "\n", + " updated_at extra_field \\\n", + "0 2022-07-08 07:08:26.802266+02:00 NaN \n", + "1 2022-02-18 15:57:55.792581+01:00 NaN \n", + "2 2021-01-25 19:16:05.187114+01:00 NaN \n", + "3 2023-03-13 11:30:29.525335+01:00 NaN \n", + "4 2021-01-25 19:23:06.816402+01:00 NaN \n", + ".. ... ... \n", + "551 2022-02-18 15:57:55.792581+01:00 NaN \n", + "552 2022-02-18 15:57:55.792581+01:00 NaN \n", + "553 2022-02-18 15:55:03.576236+01:00 NaN \n", + "554 2022-02-18 16:16:00.199543+01:00 NaN \n", + "555 2022-02-18 16:16:00.199543+01:00 NaN \n", + "\n", + " identifier \n", + "0 21d4b0043c12b21952b0797d140991a1 \n", + "1 223b09e6c3f1f75dbf8df019af97a555 \n", + "2 ba33b7b6d225a75d713a356b49c4d915 \n", + "3 a00b61ad933518856f86e63ca91a5750 \n", + "4 7f6013803c242253a5ccde80f780984f \n", + ".. ... \n", + "551 7d888e42abe101fc8b21dc88948c8b74 \n", + "552 3cf21731c25eee650d5b232ee4780563 \n", + "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", + "554 4c53016fc65847646f600eff853593e5 \n", + "555 90e642c0e1ef6bc9f2bc43089798de00 \n", + "\n", + "[556 rows x 6 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pricing_formulas" + ] + }, + { + "cell_type": "markdown", + "id": "46aad10f-8530-410e-872b-bb253c553a46", + "metadata": {}, + "source": [ + "# jointure entre les bases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4c3edd1-6d58-4c57-b3e4-0ef3529f6b8c", + "metadata": {}, + "outputs": [], + "source": [ + "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "eac537e1-bbad-45bc-a85c-12b675da1088", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge1 entre products et pricing_formulas\n", + "base1=products.merge(pricing_formulas, how='left', left_on= 'pricing_formula_id', right_on= 'id', suffixes = (\"_products\", \"_pricing_formula\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "75be3a30-3114-432d-87d6-697533c3c871", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge2 entre base1 et products_groups\n", + "base2=base1.merge(products_groups, how='left', left_on= 'id_pricing_formula', right_on= 'id', suffixes = (\"_merge2\", \"_product_group\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "34a169c6-07a8-4ac3-a9e1-d7e7461f7310", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge3 entre base2 et type_of_pricing_formulas\n", + "base3=base2.merge(type_of_pricing_formulas, how='left', left_on= 'id_pricing_formula', right_on= 'pricing_formula_id', suffixes = (\"_merge3\", \"_type_of_pricing_f\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "f44f40d2-5304-4931-b7e6-fcc06b2657b6", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge4 entre base3 et type_of_pricing_formulas\n", + "df_product_pricing=base3.merge(product_packs, how='left', left_on= 'product_pack_id', right_on= 'id', suffixes = (\"_merge4\", \"_product_pack\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "a28772c3-7bc1-46b4-acc8-1388dc60ec98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_productsamountis_full_pricerepresentation_id_merge2pricing_formula_id_merge2created_at_productsupdated_at_productscategory_id_merge2apply_priceproducts_group_id...pricing_formula_idcreated_at_type_of_pricing_fupdated_at_type_of_pricing_fidentifier_merge4idname_product_packtype_ofcreated_atupdated_atidentifier_product_pack
0106829.0False9141142020-09-03 14:09:43.119798+02:002020-09-03 14:09:43.119798+02:00410.010655...114.02021-02-15 17:02:27.395376+01:002021-02-15 17:02:27.395376+01:003706121eb9f43b635bef1433c06f679c1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
14789.5False2731312020-09-03 13:21:22.711773+02:002020-09-03 13:21:22.711773+02:0010.0471...131.02021-02-05 11:52:05.923905+01:002021-02-05 11:52:05.923905+01:000aceb248607671792298436004b952751NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
22087311.5False2751372020-09-03 14:46:33.589030+02:002020-09-03 14:46:33.589030+02:0010.020825...137.02021-02-05 11:52:05.939898+01:002021-02-05 11:52:05.939898+01:0093002d4637331edd81ffc28b6e8e89c01NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
31571428.0False8251992022-01-28 19:29:23.525722+01:002022-01-28 19:29:23.525722+01:0050.0156773...9.02021-02-05 11:52:06.107939+01:002021-02-05 11:52:06.107939+01:007d0b25bdfff9f366da8be820608c81911NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
413418.5False9932020-09-03 13:29:30.773089+02:002020-09-03 13:29:30.773089+02:0010.01175...93.02021-02-05 11:52:06.004162+01:002021-02-05 11:52:06.004162+01:001dbb0795e8f47cb75ba7cdb08c06be5f1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
\n", + "

5 rows × 41 columns

\n", + "
" + ], + "text/plain": [ + " id_products amount is_full_price representation_id_merge2 \\\n", + "0 10682 9.0 False 914 \n", + "1 478 9.5 False 273 \n", + "2 20873 11.5 False 275 \n", + "3 157142 8.0 False 82519 \n", + "4 1341 8.5 False 9 \n", + "\n", + " pricing_formula_id_merge2 created_at_products \\\n", + "0 114 2020-09-03 14:09:43.119798+02:00 \n", + "1 131 2020-09-03 13:21:22.711773+02:00 \n", + "2 137 2020-09-03 14:46:33.589030+02:00 \n", + "3 9 2022-01-28 19:29:23.525722+01:00 \n", + "4 93 2020-09-03 13:29:30.773089+02:00 \n", + "\n", + " updated_at_products category_id_merge2 apply_price \\\n", + "0 2020-09-03 14:09:43.119798+02:00 41 0.0 \n", + "1 2020-09-03 13:21:22.711773+02:00 1 0.0 \n", + "2 2020-09-03 14:46:33.589030+02:00 1 0.0 \n", + "3 2022-01-28 19:29:23.525722+01:00 5 0.0 \n", + "4 2020-09-03 13:29:30.773089+02:00 1 0.0 \n", + "\n", + " products_group_id ... pricing_formula_id \\\n", + "0 10655 ... 114.0 \n", + "1 471 ... 131.0 \n", + "2 20825 ... 137.0 \n", + "3 156773 ... 9.0 \n", + "4 1175 ... 93.0 \n", + "\n", + " created_at_type_of_pricing_f updated_at_type_of_pricing_f \\\n", + "0 2021-02-15 17:02:27.395376+01:00 2021-02-15 17:02:27.395376+01:00 \n", + "1 2021-02-05 11:52:05.923905+01:00 2021-02-05 11:52:05.923905+01:00 \n", + "2 2021-02-05 11:52:05.939898+01:00 2021-02-05 11:52:05.939898+01:00 \n", + "3 2021-02-05 11:52:06.107939+01:00 2021-02-05 11:52:06.107939+01:00 \n", + "4 2021-02-05 11:52:06.004162+01:00 2021-02-05 11:52:06.004162+01:00 \n", + "\n", + " identifier_merge4 id name_product_pack type_of \\\n", + "0 3706121eb9f43b635bef1433c06f679c 1 NaN 0 \n", + "1 0aceb248607671792298436004b95275 1 NaN 0 \n", + "2 93002d4637331edd81ffc28b6e8e89c0 1 NaN 0 \n", + "3 7d0b25bdfff9f366da8be820608c8191 1 NaN 0 \n", + "4 1dbb0795e8f47cb75ba7cdb08c06be5f 1 NaN 0 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "1 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "2 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "3 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "4 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "\n", + " identifier_product_pack \n", + "0 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "1 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "2 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "3 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "4 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "\n", + "[5 rows x 41 columns]" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_product_pricing.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03442997-806f-4285-a139-3bad46bb4522", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d22a0d75-53c5-4b54-9060-c9e7c307fb13", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data\"\n", + "directory_path = '2'" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7c229dad-6ebd-4f43-99f1-fb330dc29466", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/2/2campaign_stats.csv',\n", + " 'bdc2324-data/2/2campaigns.csv',\n", + " 'bdc2324-data/2/2categories.csv',\n", + " 'bdc2324-data/2/2contribution_sites.csv',\n", + " 'bdc2324-data/2/2contributions.csv',\n", + " 'bdc2324-data/2/2countries.csv',\n", + " 'bdc2324-data/2/2currencies.csv',\n", + " 'bdc2324-data/2/2customer_target_mappings.csv',\n", + " 'bdc2324-data/2/2customersplus.csv',\n", + " 'bdc2324-data/2/2event_types.csv',\n", + " 'bdc2324-data/2/2events.csv',\n", + " 'bdc2324-data/2/2facilities.csv',\n", + " 'bdc2324-data/2/2link_stats.csv',\n", + " 'bdc2324-data/2/2pricing_formulas.csv',\n", + " 'bdc2324-data/2/2product_packs.csv',\n", + " 'bdc2324-data/2/2products.csv',\n", + " 'bdc2324-data/2/2products_groups.csv',\n", + " 'bdc2324-data/2/2purchases.csv',\n", + " 'bdc2324-data/2/2representation_category_capacities.csv',\n", + " 'bdc2324-data/2/2representations.csv',\n", + " 'bdc2324-data/2/2seasons.csv',\n", + " 'bdc2324-data/2/2structure_tag_mappings.csv',\n", + " 'bdc2324-data/2/2suppliers.csv',\n", + " 'bdc2324-data/2/2tags.csv',\n", + " 'bdc2324-data/2/2target_types.csv',\n", + " 'bdc2324-data/2/2targets.csv',\n", + " 'bdc2324-data/2/2tickets.csv']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "BUCKET = \"bdc2324-data/2\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "df3d3548-3d76-4f07-afa1-e240932bc1c7", + "metadata": {}, + "outputs": [], + "source": [ + "dic_base_ent2=['campaign_stats','campaigns','categories','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "90f8d5fc-43f3-4f36-b8cc-89a41785f032", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_438/673681459.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" + ] + } + ], + "source": [ + "dic_base_ent2=['campaign_stats','campaigns','categories','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']\n", + "for nom_base in dic_base_ent2:\n", + " FILE_PATH_S3_fanta = 'bdc2324-data/2/2' + nom_base + '.csv'\n", + " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3e39a584-e02b-41b2-831c-33b920e298e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "27" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dic_base_ent2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b6c6f65", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "def calculer_proportion_valeurs_manquantes_et_exporter(databases, fichier_sortie='proportion_valeurs_manquantes.xlsx'):\n", + " \"\"\"\n", + " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données et exporter les résultats dans un fichier Excel.\n", + "\n", + " Paramètres:\n", + " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", + " - fichier_sortie (str): Le chemin du fichier Excel de sortie.\n", + "\n", + " Retourne:\n", + " - Un fichier Excel où chaque onglet représente une base de données différente avec la proportion de valeurs manquantes pour chaque variable.\n", + " \"\"\"\n", + " with pd.ExcelWriter(fichier_sortie) as writer:\n", + " for nom_db, df in databases.items():\n", + " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", + " proportion_manquantes = df.isnull().mean()\n", + " # Convertir en DataFrame pour un meilleur affichage\n", + " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", + " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", + " # Écrire le DataFrame dans un onglet du fichier Excel\n", + " resultats_df.to_excel(writer, sheet_name=nom_db)\n", + "\n", + " print(f\"Les résultats ont été exportés dans le fichier '{fichier_sortie}'.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06759646-9419-4841-b12f-bbfceb417f3a", + "metadata": {}, + "outputs": [], + "source": [ + "#fonction calcul la proportion de valeur manquante\n", + "\n", + "import pandas as pd\n", + "\n", + "def calculer_proportion_valeurs_manquantes(databases):\n", + " \"\"\"\n", + " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données.\n", + "\n", + " Paramètres:\n", + " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", + "\n", + " Retourne:\n", + " - Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont des DataFrames avec la proportion de valeurs manquantes pour chaque variable.\n", + " \"\"\"\n", + " resultats = {}\n", + " for nom_db, df in databases.items():\n", + " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", + " proportion_manquantes = df.isnull().mean()\n", + " # Convertir en DataFrame pour un meilleur affichage\n", + " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", + " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", + " # Ajouter le résultat au dictionnaire\n", + " resultats[nom_db] = resultats_df\n", + " return resultats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0960daa8", + "metadata": {}, + "outputs": [], + "source": [ + "# Exemple d'utilisation\n", + "\n", + "databases = {'Base1': campaign_stats, 'Base2': campaigns}\n", + "\n", + "resultats = calculer_proportion_valeurs_manquantes(databases)\n", + "\n", + "for nom_db, resultat in resultats.items():\n", + " print(f\"Base de données: {nom_db}\")\n", + " print(resultat)\n", + " print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77dc02bb", + "metadata": {}, + "outputs": [], + "source": [ + "# Exemple d'utilisation\n", + "dict={'campaign_stats': campaign_stats,\n", + " 'campaigns': campaigns,\n", + " 'categories': categories,\n", + " 'contribution_sites': contribution_sites,\n", + " 'contributions': contributions,\n", + " 'countries': countries,\n", + " 'currencies': currencies,\n", + " 'customer_target_mappings': customer_target_mappings,\n", + " 'customersplus': customersplus,\n", + " 'event_types': event_types,\n", + " 'events': events,\n", + " 'facilities': facilities,\n", + " 'link_stats': link_stats,\n", + " 'pricing_formulas': pricing_formulas,\n", + " 'product_packs': product_packs,\n", + " 'products': products,\n", + " 'products_groups': products_groups,\n", + " 'purchases': purchases,\n", + " 'representation_category_capacities': representation_category_capacities,\n", + " 'representations': representations,\n", + " 'seasons': seasons,\n", + " 'structure_tag_mappings': structure_tag_mappings,\n", + " 'suppliers': suppliers,\n", + " 'tags': tags,\n", + " 'target_types': target_types,\n", + " 'targets': targets,\n", + " 'tickets': tickets}\n", + "\n", + "resultats = calculer_proportion_valeurs_manquantes(dict)\n", + "\n", + "for nom_db, resultat in resultats.items():\n", + " print(f\"Base de données: {nom_db}\")\n", + " print(resultat)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "60be9271", + "metadata": {}, + "outputs": [], + "source": [ + "#MEME CODE mais avec l'exportation de result a en format excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "955fe358", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "def calculer_proportion_valeurs_manquantes_et_exporter(databases, fichier_sortie='proportion_valeurs_manquantes.xlsx'):\n", + " \"\"\"\n", + " Calculer la proportion de valeurs manquantes pour chaque variable dans chaque base de données et exporter les résultats dans un fichier Excel.\n", + "\n", + " Paramètres:\n", + " - databases (dict): Un dictionnaire où les clés sont les noms des bases de données et les valeurs sont les DataFrames pandas.\n", + " - fichier_sortie (str): Le chemin du fichier Excel de sortie.\n", + "\n", + " Retourne:\n", + " - Un fichier Excel où chaque onglet représente une base de données différente avec la proportion de valeurs manquantes pour chaque variable.\n", + " \"\"\"\n", + " with pd.ExcelWriter(fichier_sortie) as writer:\n", + " for nom_db, df in databases.items():\n", + " # Calculer la proportion de valeurs manquantes pour chaque colonne\n", + " proportion_manquantes = df.isnull().mean()\n", + " # Convertir en DataFrame pour un meilleur affichage\n", + " resultats_df = pd.DataFrame(proportion_manquantes, columns=['ProportionValeursManquantes'])\n", + " resultats_df['ProportionValeursManquantes'] = resultats_df['ProportionValeursManquantes'].map(lambda x: f\"{x:.2%}\")\n", + " # Écrire le DataFrame dans un onglet du fichier Excel\n", + " resultats_df.to_excel(writer, sheet_name=nom_db)\n", + "\n", + " print(f\"Les résultats ont été exportés dans le fichier '{fichier_sortie}'.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7897b689", + "metadata": {}, + "outputs": [], + "source": [ + "# Exemple d'utilisation\n", + "dict={'campaign_stats': campaign_stats,\n", + " 'campaigns': campaigns,\n", + " 'categories': categories,\n", + " 'contribution_sites': contribution_sites,\n", + " 'contributions': contributions,\n", + " 'countries': countries,\n", + " 'currencies': currencies,\n", + " 'customer_target_mappings': customer_target_mappings,\n", + " 'customersplus': customersplus,\n", + " 'event_types': event_types,\n", + " 'events': events,\n", + " 'facilities': facilities,\n", + " 'link_stats': link_stats,\n", + " 'pricing_formulas': pricing_formulas,\n", + " 'product_packs': product_packs,\n", + " 'products': products,\n", + " 'products_groups': products_groups,\n", + " 'purchases': purchases,\n", + " 'representation_category_capacities': representation_category_capacities,\n", + " 'representations': representations,\n", + " 'seasons': seasons,\n", + " 'structure_tag_mappings': structure_tag_mappings,\n", + " 'suppliers': suppliers,\n", + " 'tags': tags,\n", + " 'target_types': target_types,\n", + " 'targets': targets,\n", + " 'tickets': tickets}\n", + "\n", + "calculer_proportion_valeurs_manquantes_et_exporter(dict, 'proportion_valeurs_manquantes_ent1.xlsx')\n" + ] + }, + { + "cell_type": "markdown", + "id": "514273f4", + "metadata": {}, + "source": [ + "## Entreprise 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69b8f59a", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data/3\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b4c005f", + "metadata": {}, + "outputs": [], + "source": [ + "dic_base_ent3=['campaign_stats','campaigns','categories','consumptions','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aae542d6", + "metadata": {}, + "outputs": [], + "source": [ + "dic_base_ent3=['campaign_stats','campaigns','categories','consumptions','contribution_sites','contributions','countries','currencies','customer_target_mappings','customersplus','event_types','events','facilities','link_stats','pricing_formulas','product_packs','products','products_groups','purchases','representation_category_capacities','representations','seasons','structure_tag_mappings','suppliers','tags','target_types','targets','tickets']\n", + "for nom_base in dic_base_ent2:\n", + " FILE_PATH_S3_fanta = 'bdc2324-data/3/3' + nom_base + '.csv'\n", + " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4b2df76", + "metadata": {}, + "outputs": [], + "source": [ + "dict4={'campaign_stats': campaign_stats,\n", + " 'campaigns': campaigns,\n", + " 'categories': categories,\n", + " 'consumptions': consumptions,\n", + " 'contribution_sites': contribution_sites,\n", + " 'contributions': contributions,\n", + " 'countries': countries,\n", + " 'currencies': currencies,\n", + " 'customer_target_mappings': customer_target_mappings,\n", + " 'customersplus': customersplus,\n", + " 'event_types': event_types,\n", + " 'events': events,\n", + " 'facilities': facilities,\n", + " 'link_stats': link_stats,\n", + " 'pricing_formulas': pricing_formulas,\n", + " 'product_packs': product_packs,\n", + " 'products': products,\n", + " 'products_groups': products_groups,\n", + " 'purchases': purchases,\n", + " 'representation_category_capacities': representation_category_capacities,\n", + " 'representations': representations,\n", + " 'seasons': seasons,\n", + " 'structure_tag_mappings': structure_tag_mappings,\n", + " 'suppliers': suppliers,\n", + " 'tags': tags,\n", + " 'target_types': target_types,\n", + " 'targets': targets,\n", + " 'tickets': tickets}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}