{ "cells": [ { "cell_type": "markdown", "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1',\n", " 'bdc2324-data/10',\n", " 'bdc2324-data/101',\n", " 'bdc2324-data/11',\n", " 'bdc2324-data/12',\n", " 'bdc2324-data/13',\n", " 'bdc2324-data/14',\n", " 'bdc2324-data/2',\n", " 'bdc2324-data/3',\n", " 'bdc2324-data/4',\n", " 'bdc2324-data/5',\n", " 'bdc2324-data/6',\n", " 'bdc2324-data/7',\n", " 'bdc2324-data/8',\n", " 'bdc2324-data/9']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "import s3fs\n", "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "BUCKET = \"bdc2324-data\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 3, "id": "d60f6b27-00b4-4655-9325-79169d1e68df", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "bdc2324-data/1\n", "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1categories.csv', 'bdc2324-data/1/1countries.csv', 'bdc2324-data/1/1currencies.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1facilities.csv', 'bdc2324-data/1/1link_stats.csv', 'bdc2324-data/1/1pricing_formulas.csv', 
'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1representation_category_capacities.csv', 'bdc2324-data/1/1representations.csv', 'bdc2324-data/1/1seasons.csv', 'bdc2324-data/1/1structure_tag_mappings.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tags.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_of_categories.csv', 'bdc2324-data/1/1type_of_pricing_formulas.csv', 'bdc2324-data/1/1type_ofs.csv']\n", "bdc2324-data/2\n", "['bdc2324-data/2/2campaign_stats.csv', 'bdc2324-data/2/2campaigns.csv', 'bdc2324-data/2/2categories.csv', 'bdc2324-data/2/2contribution_sites.csv', 'bdc2324-data/2/2contributions.csv', 'bdc2324-data/2/2countries.csv', 'bdc2324-data/2/2currencies.csv', 'bdc2324-data/2/2customer_target_mappings.csv', 'bdc2324-data/2/2customersplus.csv', 'bdc2324-data/2/2event_types.csv', 'bdc2324-data/2/2events.csv', 'bdc2324-data/2/2facilities.csv', 'bdc2324-data/2/2link_stats.csv', 'bdc2324-data/2/2pricing_formulas.csv', 'bdc2324-data/2/2product_packs.csv', 'bdc2324-data/2/2products.csv', 'bdc2324-data/2/2products_groups.csv', 'bdc2324-data/2/2purchases.csv', 'bdc2324-data/2/2representation_category_capacities.csv', 'bdc2324-data/2/2representations.csv', 'bdc2324-data/2/2seasons.csv', 'bdc2324-data/2/2structure_tag_mappings.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/2/2tags.csv', 'bdc2324-data/2/2target_types.csv', 'bdc2324-data/2/2targets.csv', 'bdc2324-data/2/2tickets.csv']\n", "bdc2324-data/3\n", "['bdc2324-data/3/3campaign_stats.csv', 'bdc2324-data/3/3campaigns.csv', 'bdc2324-data/3/3categories.csv', 'bdc2324-data/3/3consumptions.csv', 'bdc2324-data/3/3contribution_sites.csv', 'bdc2324-data/3/3contributions.csv', 'bdc2324-data/3/3countries.csv', 'bdc2324-data/3/3currencies.csv', 'bdc2324-data/3/3customer_target_mappings.csv', 
'bdc2324-data/3/3customersplus.csv', 'bdc2324-data/3/3event_types.csv', 'bdc2324-data/3/3events.csv', 'bdc2324-data/3/3facilities.csv', 'bdc2324-data/3/3link_stats.csv', 'bdc2324-data/3/3pricing_formulas.csv', 'bdc2324-data/3/3product_packs.csv', 'bdc2324-data/3/3products.csv', 'bdc2324-data/3/3products_groups.csv', 'bdc2324-data/3/3purchases.csv', 'bdc2324-data/3/3representation_category_capacities.csv', 'bdc2324-data/3/3representations.csv', 'bdc2324-data/3/3seasons.csv', 'bdc2324-data/3/3structure_tag_mappings.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/3/3tags.csv', 'bdc2324-data/3/3target_types.csv', 'bdc2324-data/3/3targets.csv', 'bdc2324-data/3/3tickets.csv']\n", "bdc2324-data/4\n", "['bdc2324-data/4/4campaign_stats.csv', 'bdc2324-data/4/4campaigns.csv', 'bdc2324-data/4/4categories.csv', 'bdc2324-data/4/4contribution_sites.csv', 'bdc2324-data/4/4contributions.csv', 'bdc2324-data/4/4countries.csv', 'bdc2324-data/4/4currencies.csv', 'bdc2324-data/4/4customer_target_mappings.csv', 'bdc2324-data/4/4customersplus.csv', 'bdc2324-data/4/4event_types.csv', 'bdc2324-data/4/4events.csv', 'bdc2324-data/4/4facilities.csv', 'bdc2324-data/4/4link_stats.csv', 'bdc2324-data/4/4pricing_formulas.csv', 'bdc2324-data/4/4product_packs.csv', 'bdc2324-data/4/4products.csv', 'bdc2324-data/4/4products_groups.csv', 'bdc2324-data/4/4purchases.csv', 'bdc2324-data/4/4representation_category_capacities.csv', 'bdc2324-data/4/4representations.csv', 'bdc2324-data/4/4seasons.csv', 'bdc2324-data/4/4structure_tag_mappings.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/4/4tags.csv', 'bdc2324-data/4/4target_types.csv', 'bdc2324-data/4/4targets.csv', 'bdc2324-data/4/4tickets.csv', 'bdc2324-data/4/4type_of_pricing_formulas.csv', 'bdc2324-data/4/4type_ofs.csv']\n", "bdc2324-data/5\n", "['bdc2324-data/5/5campaign_stats.csv', 'bdc2324-data/5/5campaigns.csv', 'bdc2324-data/5/5categories.csv', 'bdc2324-data/5/5consumptions.csv', 'bdc2324-data/5/5countries.csv', 
'bdc2324-data/5/5currencies.csv', 'bdc2324-data/5/5customer_target_mappings.csv', 'bdc2324-data/5/5customersplus.csv', 'bdc2324-data/5/5event_types.csv', 'bdc2324-data/5/5events.csv', 'bdc2324-data/5/5facilities.csv', 'bdc2324-data/5/5link_stats.csv', 'bdc2324-data/5/5pricing_formulas.csv', 'bdc2324-data/5/5product_packs.csv', 'bdc2324-data/5/5products.csv', 'bdc2324-data/5/5products_groups.csv', 'bdc2324-data/5/5purchases.csv', 'bdc2324-data/5/5representation_category_capacities.csv', 'bdc2324-data/5/5representations.csv', 'bdc2324-data/5/5seasons.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/5/5target_types.csv', 'bdc2324-data/5/5targets.csv', 'bdc2324-data/5/5tickets.csv']\n", "bdc2324-data/6\n", "['bdc2324-data/6/6campaign_stats.csv', 'bdc2324-data/6/6campaigns.csv', 'bdc2324-data/6/6categories.csv', 'bdc2324-data/6/6consumptions.csv', 'bdc2324-data/6/6countries.csv', 'bdc2324-data/6/6currencies.csv', 'bdc2324-data/6/6customer_target_mappings.csv', 'bdc2324-data/6/6customersplus.csv', 'bdc2324-data/6/6event_types.csv', 'bdc2324-data/6/6events.csv', 'bdc2324-data/6/6facilities.csv', 'bdc2324-data/6/6link_stats.csv', 'bdc2324-data/6/6pricing_formulas.csv', 'bdc2324-data/6/6product_packs.csv', 'bdc2324-data/6/6products.csv', 'bdc2324-data/6/6products_groups.csv', 'bdc2324-data/6/6purchases.csv', 'bdc2324-data/6/6representation_category_capacities.csv', 'bdc2324-data/6/6representations.csv', 'bdc2324-data/6/6seasons.csv', 'bdc2324-data/6/6structure_tag_mappings.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/6/6tags.csv', 'bdc2324-data/6/6target_types.csv', 'bdc2324-data/6/6targets.csv', 'bdc2324-data/6/6tickets.csv', 'bdc2324-data/6/6type_of_pricing_formulas.csv', 'bdc2324-data/6/6type_ofs.csv']\n", "bdc2324-data/7\n", "['bdc2324-data/7/7campaign_stats.csv', 'bdc2324-data/7/7campaigns.csv', 'bdc2324-data/7/7categories.csv', 'bdc2324-data/7/7consumptions.csv', 'bdc2324-data/7/7countries.csv', 'bdc2324-data/7/7currencies.csv', 
'bdc2324-data/7/7customer_target_mappings.csv', 'bdc2324-data/7/7customersplus.csv', 'bdc2324-data/7/7event_types.csv', 'bdc2324-data/7/7events.csv', 'bdc2324-data/7/7facilities.csv', 'bdc2324-data/7/7link_stats.csv', 'bdc2324-data/7/7pricing_formulas.csv', 'bdc2324-data/7/7product_packs.csv', 'bdc2324-data/7/7products.csv', 'bdc2324-data/7/7products_groups.csv', 'bdc2324-data/7/7purchases.csv', 'bdc2324-data/7/7representation_category_capacities.csv', 'bdc2324-data/7/7representation_types.csv', 'bdc2324-data/7/7representations.csv', 'bdc2324-data/7/7seasons.csv', 'bdc2324-data/7/7structure_tag_mappings.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/7/7tags.csv', 'bdc2324-data/7/7target_types.csv', 'bdc2324-data/7/7targets.csv', 'bdc2324-data/7/7tickets.csv', 'bdc2324-data/7/7type_of_categories.csv', 'bdc2324-data/7/7type_of_pricing_formulas.csv', 'bdc2324-data/7/7type_ofs.csv']\n", "bdc2324-data/8\n", "['bdc2324-data/8/8campaign_stats.csv', 'bdc2324-data/8/8campaigns.csv', 'bdc2324-data/8/8categories.csv', 'bdc2324-data/8/8countries.csv', 'bdc2324-data/8/8currencies.csv', 'bdc2324-data/8/8customer_target_mappings.csv', 'bdc2324-data/8/8customersplus.csv', 'bdc2324-data/8/8event_types.csv', 'bdc2324-data/8/8events.csv', 'bdc2324-data/8/8facilities.csv', 'bdc2324-data/8/8link_stats.csv', 'bdc2324-data/8/8pricing_formulas.csv', 'bdc2324-data/8/8product_packs.csv', 'bdc2324-data/8/8products.csv', 'bdc2324-data/8/8products_groups.csv', 'bdc2324-data/8/8purchases.csv', 'bdc2324-data/8/8representation_category_capacities.csv', 'bdc2324-data/8/8representations.csv', 'bdc2324-data/8/8seasons.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/8/8target_types.csv', 'bdc2324-data/8/8targets.csv', 'bdc2324-data/8/8tickets.csv', 'bdc2324-data/8/8type_of_categories.csv', 'bdc2324-data/8/8type_of_pricing_formulas.csv', 'bdc2324-data/8/8type_ofs.csv']\n", "bdc2324-data/9\n", "['bdc2324-data/9/9campaign_stats.csv', 'bdc2324-data/9/9campaigns.csv', 
'bdc2324-data/9/9categories.csv', 'bdc2324-data/9/9countries.csv', 'bdc2324-data/9/9currencies.csv', 'bdc2324-data/9/9customer_target_mappings.csv', 'bdc2324-data/9/9customersplus.csv', 'bdc2324-data/9/9event_types.csv', 'bdc2324-data/9/9events.csv', 'bdc2324-data/9/9facilities.csv', 'bdc2324-data/9/9link_stats.csv', 'bdc2324-data/9/9pricing_formulas.csv', 'bdc2324-data/9/9product_packs.csv', 'bdc2324-data/9/9products.csv', 'bdc2324-data/9/9products_groups.csv', 'bdc2324-data/9/9purchases.csv', 'bdc2324-data/9/9representation_category_capacities.csv', 'bdc2324-data/9/9representations.csv', 'bdc2324-data/9/9seasons.csv', 'bdc2324-data/9/9suppliers.csv', 'bdc2324-data/9/9target_types.csv', 'bdc2324-data/9/9targets.csv', 'bdc2324-data/9/9tickets.csv']\n", "bdc2324-data/10\n", "['bdc2324-data/10/10campaign_stats.csv', 'bdc2324-data/10/10campaigns.csv', 'bdc2324-data/10/10categories.csv', 'bdc2324-data/10/10countries.csv', 'bdc2324-data/10/10currencies.csv', 'bdc2324-data/10/10customer_target_mappings.csv', 'bdc2324-data/10/10customersplus.csv', 'bdc2324-data/10/10event_types.csv', 'bdc2324-data/10/10events.csv', 'bdc2324-data/10/10facilities.csv', 'bdc2324-data/10/10link_stats.csv', 'bdc2324-data/10/10pricing_formulas.csv', 'bdc2324-data/10/10product_packs.csv', 'bdc2324-data/10/10products.csv', 'bdc2324-data/10/10products_groups.csv', 'bdc2324-data/10/10purchases.csv', 'bdc2324-data/10/10representation_category_capacities.csv', 'bdc2324-data/10/10representation_types.csv', 'bdc2324-data/10/10representations.csv', 'bdc2324-data/10/10seasons.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/10/10tags.csv', 'bdc2324-data/10/10target_types.csv', 'bdc2324-data/10/10targets.csv', 'bdc2324-data/10/10tickets.csv', 'bdc2324-data/10/10type_of_pricing_formulas.csv', 'bdc2324-data/10/10type_ofs.csv']\n", "bdc2324-data/11\n", "['bdc2324-data/11/11campaign_stats.csv', 'bdc2324-data/11/11campaigns.csv', 'bdc2324-data/11/11categories.csv', 'bdc2324-data/11/11countries.csv', 
'bdc2324-data/11/11currencies.csv', 'bdc2324-data/11/11customer_target_mappings.csv', 'bdc2324-data/11/11customersplus.csv', 'bdc2324-data/11/11event_types.csv', 'bdc2324-data/11/11events.csv', 'bdc2324-data/11/11facilities.csv', 'bdc2324-data/11/11link_stats.csv', 'bdc2324-data/11/11pricing_formulas.csv', 'bdc2324-data/11/11product_packs.csv', 'bdc2324-data/11/11products.csv', 'bdc2324-data/11/11products_groups.csv', 'bdc2324-data/11/11purchases.csv', 'bdc2324-data/11/11representation_category_capacities.csv', 'bdc2324-data/11/11representations.csv', 'bdc2324-data/11/11seasons.csv', 'bdc2324-data/11/11structure_tag_mappings.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/11/11tags.csv', 'bdc2324-data/11/11target_types.csv', 'bdc2324-data/11/11targets.csv', 'bdc2324-data/11/11tickets.csv']\n", "bdc2324-data/12\n", "['bdc2324-data/12/12campaign_stats.csv', 'bdc2324-data/12/12campaigns.csv', 'bdc2324-data/12/12categories.csv', 'bdc2324-data/12/12consumptions.csv', 'bdc2324-data/12/12countries.csv', 'bdc2324-data/12/12currencies.csv', 'bdc2324-data/12/12customer_target_mappings.csv', 'bdc2324-data/12/12customersplus.csv', 'bdc2324-data/12/12event_types.csv', 'bdc2324-data/12/12events.csv', 'bdc2324-data/12/12facilities.csv', 'bdc2324-data/12/12link_stats.csv', 'bdc2324-data/12/12pricing_formulas.csv', 'bdc2324-data/12/12product_packs.csv', 'bdc2324-data/12/12products.csv', 'bdc2324-data/12/12products_groups.csv', 'bdc2324-data/12/12purchases.csv', 'bdc2324-data/12/12representation_category_capacities.csv', 'bdc2324-data/12/12representations.csv', 'bdc2324-data/12/12seasons.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/12/12target_types.csv', 'bdc2324-data/12/12targets.csv', 'bdc2324-data/12/12tickets.csv', 'bdc2324-data/12/12type_ofs.csv']\n", "bdc2324-data/13\n", "['bdc2324-data/13/13campaign_stats.csv', 'bdc2324-data/13/13campaigns.csv', 'bdc2324-data/13/13categories.csv', 'bdc2324-data/13/13countries.csv', 'bdc2324-data/13/13currencies.csv', 
'bdc2324-data/13/13customer_target_mappings.csv', 'bdc2324-data/13/13customersplus.csv', 'bdc2324-data/13/13event_types.csv', 'bdc2324-data/13/13events.csv', 'bdc2324-data/13/13facilities.csv', 'bdc2324-data/13/13link_stats.csv', 'bdc2324-data/13/13pricing_formulas.csv', 'bdc2324-data/13/13product_packs.csv', 'bdc2324-data/13/13products.csv', 'bdc2324-data/13/13products_groups.csv', 'bdc2324-data/13/13purchases.csv', 'bdc2324-data/13/13representation_category_capacities.csv', 'bdc2324-data/13/13representation_types.csv', 'bdc2324-data/13/13representations.csv', 'bdc2324-data/13/13seasons.csv', 'bdc2324-data/13/13structure_tag_mappings.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/13/13tags.csv', 'bdc2324-data/13/13target_types.csv', 'bdc2324-data/13/13targets.csv', 'bdc2324-data/13/13tickets.csv']\n", "bdc2324-data/14\n", "['bdc2324-data/14/14campaign_stats.csv', 'bdc2324-data/14/14campaigns.csv', 'bdc2324-data/14/14categories.csv', 'bdc2324-data/14/14countries.csv', 'bdc2324-data/14/14currencies.csv', 'bdc2324-data/14/14customer_target_mappings.csv', 'bdc2324-data/14/14customersplus.csv', 'bdc2324-data/14/14event_types.csv', 'bdc2324-data/14/14events.csv', 'bdc2324-data/14/14facilities.csv', 'bdc2324-data/14/14link_stats.csv', 'bdc2324-data/14/14pricing_formulas.csv', 'bdc2324-data/14/14product_packs.csv', 'bdc2324-data/14/14products.csv', 'bdc2324-data/14/14products_groups.csv', 'bdc2324-data/14/14purchases.csv', 'bdc2324-data/14/14representation_category_capacities.csv', 'bdc2324-data/14/14representation_types.csv', 'bdc2324-data/14/14representations.csv', 'bdc2324-data/14/14seasons.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/14/14target_types.csv', 'bdc2324-data/14/14targets.csv', 'bdc2324-data/14/14tickets.csv', 'bdc2324-data/14/14type_of_categories.csv', 'bdc2324-data/14/14type_of_pricing_formulas.csv', 'bdc2324-data/14/14type_ofs.csv']\n", "bdc2324-data/101\n", "['bdc2324-data/101/101campaign_stats.csv', 
'bdc2324-data/101/101campaigns.csv', 'bdc2324-data/101/101categories.csv', 'bdc2324-data/101/101contribution_sites.csv', 'bdc2324-data/101/101contributions.csv', 'bdc2324-data/101/101countries.csv', 'bdc2324-data/101/101currencies.csv', 'bdc2324-data/101/101customer_target_mappings.csv', 'bdc2324-data/101/101customersplus.csv', 'bdc2324-data/101/101event_types.csv', 'bdc2324-data/101/101events.csv', 'bdc2324-data/101/101facilities.csv', 'bdc2324-data/101/101link_stats.csv', 'bdc2324-data/101/101pricing_formulas.csv', 'bdc2324-data/101/101product_packs.csv', 'bdc2324-data/101/101products.csv', 'bdc2324-data/101/101products_groups.csv', 'bdc2324-data/101/101purchases.csv', 'bdc2324-data/101/101representation_category_capacities.csv', 'bdc2324-data/101/101representations.csv', 'bdc2324-data/101/101seasons.csv', 'bdc2324-data/101/101structure_tag_mappings.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/101/101tags.csv', 'bdc2324-data/101/101target_types.csv', 'bdc2324-data/101/101targets.csv', 'bdc2324-data/101/101tickets.csv', 'bdc2324-data/101/101tickets_1.csv', 'bdc2324-data/101/101type_of_pricing_formulas.csv', 'bdc2324-data/101/101type_ofs.csv']\n" ] } ], "source": [
"# List the datasets available in each folder of the bucket.\n",
"# Folders are discovered with fs.ls instead of being hard-coded (1..14 plus 101);\n",
"# sorting on (length, name) puts the numbered folders in numeric order.\n",
"for folder in sorted(fs.ls(BUCKET), key=lambda path: (len(path), path)):\n",
"    print(folder)\n",
"    print(fs.ls(folder))" ] },
{ "cell_type": "code", "execution_count": 32, "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", "metadata": {}, "outputs": [], "source": [
"def read_s3_csv(path, **read_csv_kwargs):\n",
"    \"\"\"Read a comma-separated file from the S3 filesystem `fs` into a DataFrame.\n",
"\n",
"    path: object key including the bucket, e.g. 'bdc2324-data/1/1campaign_stats.csv'.\n",
"    read_csv_kwargs: extra keyword arguments forwarded to pd.read_csv.\n",
"    \"\"\"\n",
"    with fs.open(path, mode=\"rb\") as file_in:\n",
"        return pd.read_csv(file_in, sep=\",\", **read_csv_kwargs)\n",
"\n",
"\n",
"# Load the campaign_stats.csv files of folders 1, 2 and 3\n",
"campaign_stats_1 = read_s3_csv('bdc2324-data/1/1campaign_stats.csv')\n",
"campaign_stats_2 = read_s3_csv('bdc2324-data/2/2campaign_stats.csv')\n",
"campaign_stats_3 = read_s3_csv('bdc2324-data/3/3campaign_stats.csv')" ] },
{ "cell_type": "code", "execution_count": 34, "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", "metadata": {}, "outputs": [], "source": [
"# Parse the 'sent_at' column into timezone-aware UTC timestamps\n",
"for campaign_stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
"    campaign_stats['sent_at'] = pd.to_datetime(campaign_stats['sent_at'], format='ISO8601', utc=True)" ] },
{ "cell_type": "code", "execution_count": 35, "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2023-11-09 18:10:45+00:00\n", "2020-06-02 08:24:08+00:00\n", "2023-10-12 01:39:48+00:00\n", "2023-10-10 17:06:29+00:00\n", "2023-11-01 09:20:48+00:00\n", "2021-03-31 14:59:02+00:00\n" ] } ], "source": [
"# Does each folder correspond to a time period? --> No, the folders only serve to reduce file sizes\n",
"# Print the latest, then the earliest, 'sent_at' of each folder\n",
"for campaign_stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
"    print(campaign_stats['sent_at'].max())\n",
"    print(campaign_stats['sent_at'].min())" ] },
{ "cell_type": "code", "execution_count": 26, "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2021-03-28 18:01:09+02:00\n", "1 2021-03-28 18:01:09+02:00\n", "2 2021-03-28 18:00:59+02:00\n", "3 2021-03-28 18:00:59+02:00\n", "4 2021-03-28 18:01:06+02:00\n", " ... \n", "6214803 2023-10-23 11:32:33+02:00\n", "6214804 2023-10-23 11:32:49+02:00\n", "6214805 2023-10-23 11:33:28+02:00\n", "6214806 2023-10-23 11:31:53+02:00\n", "6214807 2023-10-23 11:33:54+02:00\n", "Name: sent_at, Length: 6214808, dtype: object" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "campaign_stats_1['sent_at']" ] },
{ "cell_type": "markdown", "id": "31f2edbf-5661-4516-9835-06d4da615c13", "metadata": {}, "source": [ "### Customersplus.csv" ] },
{ "cell_type": "code", "execution_count": 50, "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_426/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", " customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [
"# Load the customersplus.csv files of folders 1 and 2.\n",
"# low_memory=False stops pandas from inferring dtypes chunk by chunk, which is\n",
"# what triggered the mixed-types DtypeWarning on column 20 of 2customersplus.csv.\n",
"customers_plus_1 = read_s3_csv('bdc2324-data/1/1customersplus.csv', low_memory=False)\n",
"customers_plus_2 = read_s3_csv('bdc2324-data/2/2customersplus.csv', low_memory=False)" ] },
{ "cell_type": "code", "execution_count": 39, "id": "460f853a-68c0-42a7-9877-b83d3aaec813", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", " 'average_purchase_delay', 'average_price_basket',\n", " 'average_ticket_basket', 'total_price', 'preferred_category',\n", " 'preferred_supplier', 'preferred_formula', 
'purchase_count',\n", " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", " 'tenant_id'],\n", " dtype='object')" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Columns of the folder-1 customer file\n",
"customers_plus_1.columns" ] },
{ "cell_type": "code", "execution_count": 43, "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(151866, 43)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# (rows, columns) of the folder-1 customer file\n",
"customers_plus_1.shape" ] },
{ "cell_type": "code", "execution_count": 52, "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "151866" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customers_plus_1['id'].nunique()" ] },
{ "cell_type": "code", "execution_count": 53, "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "275622" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customers_plus_2['id'].nunique()" ] },
{ "cell_type": "code", "execution_count": 55, "id": "b40a653e-013f-48d0-8b57-0284587b36c5", "metadata": {}, "outputs": [], "source": [
"# Customer ids that appear in both the folder-1 and folder-2 files\n",
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" ] },
{ "cell_type": "code", "execution_count": 61, "id": "32fa2215-3c79-40b5-8643-755865959fc7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Example: for one id present in both files, print both records to check\n",
"# whether a shared id comes with shared characteristics.\n",
"# common_id comes from the previous cell; next(iter(...)) picks the same\n",
"# element as list(common_id)[0] without building a list.\n",
"example_id = next(iter(common_id))\n",
"print(customers_plus_2[customers_plus_2['id'] == example_id])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == example_id])" ] },
{ "cell_type": "code", "execution_count": 49, "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", "metadata": {},
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id 0.000000\n", "lastname 43.461341\n", "firstname 44.995588\n", "birthdate 96.419870\n", "email 8.622075\n", "street_id 0.000000\n", "created_at 0.000000\n", "updated_at 0.000000\n", "civility 100.000000\n", "is_partner 0.000000\n", "extra 100.000000\n", "deleted_at 100.000000\n", "reference 100.000000\n", "gender 0.000000\n", "is_email_true 0.000000\n", "extra_field 100.000000\n", "identifier 0.000000\n", "opt_in 0.000000\n", "structure_id 88.072380\n", "note 99.403421\n", "profession 95.913503\n", "language 99.280945\n", "mcp_contact_id 34.876141\n", "need_reload 0.000000\n", "last_buying_date 51.653431\n", "max_price 51.653431\n", "ticket_sum 0.000000\n", "average_price 8.639195\n", "fidelity 0.000000\n", "average_purchase_delay 51.653431\n", "average_price_basket 51.653431\n", "average_ticket_basket 51.653431\n", "total_price 43.014236\n", "preferred_category 100.000000\n", "preferred_supplier 100.000000\n", "preferred_formula 100.000000\n", "purchase_count 0.000000\n", "first_buying_date 51.653431\n", "last_visiting_date 100.000000\n", "zipcode 71.176564\n", "country 5.459418\n", "age 96.419870\n", "tenant_id 0.000000\n", "dtype: float64\n" ] } ], "source": [
"# Percentage of missing values per column in the folder-1 customer file\n",
"print(customers_plus_1.isna().mean()*100)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }