BDC-team-1/Notebook_AJ.ipynb

443 lines
27 KiB
Plaintext
Raw Normal View History

2023-12-22 09:58:14 +01:00
{
"cells": [
2023-12-22 13:44:48 +01:00
{
"cell_type": "markdown",
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
2023-12-22 09:58:14 +01:00
{
"cell_type": "code",
"execution_count": 1,
2023-12-24 15:29:39 +01:00
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import s3fs"
]
},
{
"cell_type": "markdown",
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
2023-12-22 09:58:14 +01:00
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
"metadata": {},
"outputs": [
{
2023-12-24 20:00:28 +01:00
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
2023-12-22 09:58:14 +01:00
}
],
"source": [
"# Filesystem handle for the S3-compatible storage. The endpoint host is read\n",
"# from the AWS_S3_ENDPOINT environment variable (KeyError if it is not set).\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"# Project bucket; the last expression lists its top-level entries.\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
2023-12-24 15:29:39 +01:00
},
{
"cell_type": "code",
2023-12-24 20:00:28 +01:00
"execution_count": 3,
2023-12-24 15:29:39 +01:00
"id": "d60f6b27-00b4-4655-9325-79169d1e68df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bdc2324-data/1\n",
"['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1categories.csv', 'bdc2324-data/1/1countries.csv', 'bdc2324-data/1/1currencies.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1facilities.csv', 'bdc2324-data/1/1link_stats.csv', 'bdc2324-data/1/1pricing_formulas.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1representation_category_capacities.csv', 'bdc2324-data/1/1representations.csv', 'bdc2324-data/1/1seasons.csv', 'bdc2324-data/1/1structure_tag_mappings.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tags.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_of_categories.csv', 'bdc2324-data/1/1type_of_pricing_formulas.csv', 'bdc2324-data/1/1type_ofs.csv']\n",
"bdc2324-data/2\n",
"['bdc2324-data/2/2campaign_stats.csv', 'bdc2324-data/2/2campaigns.csv', 'bdc2324-data/2/2categories.csv', 'bdc2324-data/2/2contribution_sites.csv', 'bdc2324-data/2/2contributions.csv', 'bdc2324-data/2/2countries.csv', 'bdc2324-data/2/2currencies.csv', 'bdc2324-data/2/2customer_target_mappings.csv', 'bdc2324-data/2/2customersplus.csv', 'bdc2324-data/2/2event_types.csv', 'bdc2324-data/2/2events.csv', 'bdc2324-data/2/2facilities.csv', 'bdc2324-data/2/2link_stats.csv', 'bdc2324-data/2/2pricing_formulas.csv', 'bdc2324-data/2/2product_packs.csv', 'bdc2324-data/2/2products.csv', 'bdc2324-data/2/2products_groups.csv', 'bdc2324-data/2/2purchases.csv', 'bdc2324-data/2/2representation_category_capacities.csv', 'bdc2324-data/2/2representations.csv', 'bdc2324-data/2/2seasons.csv', 'bdc2324-data/2/2structure_tag_mappings.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/2/2tags.csv', 'bdc2324-data/2/2target_types.csv', 'bdc2324-data/2/2targets.csv', 'bdc2324-data/2/2tickets.csv']\n",
"bdc2324-data/3\n",
"['bdc2324-data/3/3campaign_stats.csv', 'bdc2324-data/3/3campaigns.csv', 'bdc2324-data/3/3categories.csv', 'bdc2324-data/3/3consumptions.csv', 'bdc2324-data/3/3contribution_sites.csv', 'bdc2324-data/3/3contributions.csv', 'bdc2324-data/3/3countries.csv', 'bdc2324-data/3/3currencies.csv', 'bdc2324-data/3/3customer_target_mappings.csv', 'bdc2324-data/3/3customersplus.csv', 'bdc2324-data/3/3event_types.csv', 'bdc2324-data/3/3events.csv', 'bdc2324-data/3/3facilities.csv', 'bdc2324-data/3/3link_stats.csv', 'bdc2324-data/3/3pricing_formulas.csv', 'bdc2324-data/3/3product_packs.csv', 'bdc2324-data/3/3products.csv', 'bdc2324-data/3/3products_groups.csv', 'bdc2324-data/3/3purchases.csv', 'bdc2324-data/3/3representation_category_capacities.csv', 'bdc2324-data/3/3representations.csv', 'bdc2324-data/3/3seasons.csv', 'bdc2324-data/3/3structure_tag_mappings.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/3/3tags.csv', 'bdc2324-data/3/3target_types.csv', 'bdc2324-data/3/3targets.csv', 'bdc2324-data/3/3tickets.csv']\n",
"bdc2324-data/4\n",
"['bdc2324-data/4/4campaign_stats.csv', 'bdc2324-data/4/4campaigns.csv', 'bdc2324-data/4/4categories.csv', 'bdc2324-data/4/4contribution_sites.csv', 'bdc2324-data/4/4contributions.csv', 'bdc2324-data/4/4countries.csv', 'bdc2324-data/4/4currencies.csv', 'bdc2324-data/4/4customer_target_mappings.csv', 'bdc2324-data/4/4customersplus.csv', 'bdc2324-data/4/4event_types.csv', 'bdc2324-data/4/4events.csv', 'bdc2324-data/4/4facilities.csv', 'bdc2324-data/4/4link_stats.csv', 'bdc2324-data/4/4pricing_formulas.csv', 'bdc2324-data/4/4product_packs.csv', 'bdc2324-data/4/4products.csv', 'bdc2324-data/4/4products_groups.csv', 'bdc2324-data/4/4purchases.csv', 'bdc2324-data/4/4representation_category_capacities.csv', 'bdc2324-data/4/4representations.csv', 'bdc2324-data/4/4seasons.csv', 'bdc2324-data/4/4structure_tag_mappings.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/4/4tags.csv', 'bdc2324-data/4/4target_types.csv', 'bdc2324-data/4/4targets.csv', 'bdc2324-data/4/4tickets.csv', 'bdc2324-data/4/4type_of_pricing_formulas.csv', 'bdc2324-data/4/4type_ofs.csv']\n",
"bdc2324-data/5\n",
"['bdc2324-data/5/5campaign_stats.csv', 'bdc2324-data/5/5campaigns.csv', 'bdc2324-data/5/5categories.csv', 'bdc2324-data/5/5consumptions.csv', 'bdc2324-data/5/5countries.csv', 'bdc2324-data/5/5currencies.csv', 'bdc2324-data/5/5customer_target_mappings.csv', 'bdc2324-data/5/5customersplus.csv', 'bdc2324-data/5/5event_types.csv', 'bdc2324-data/5/5events.csv', 'bdc2324-data/5/5facilities.csv', 'bdc2324-data/5/5link_stats.csv', 'bdc2324-data/5/5pricing_formulas.csv', 'bdc2324-data/5/5product_packs.csv', 'bdc2324-data/5/5products.csv', 'bdc2324-data/5/5products_groups.csv', 'bdc2324-data/5/5purchases.csv', 'bdc2324-data/5/5representation_category_capacities.csv', 'bdc2324-data/5/5representations.csv', 'bdc2324-data/5/5seasons.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/5/5target_types.csv', 'bdc2324-data/5/5targets.csv', 'bdc2324-data/5/5tickets.csv']\n",
"bdc2324-data/6\n",
"['bdc2324-data/6/6campaign_stats.csv', 'bdc2324-data/6/6campaigns.csv', 'bdc2324-data/6/6categories.csv', 'bdc2324-data/6/6consumptions.csv', 'bdc2324-data/6/6countries.csv', 'bdc2324-data/6/6currencies.csv', 'bdc2324-data/6/6customer_target_mappings.csv', 'bdc2324-data/6/6customersplus.csv', 'bdc2324-data/6/6event_types.csv', 'bdc2324-data/6/6events.csv', 'bdc2324-data/6/6facilities.csv', 'bdc2324-data/6/6link_stats.csv', 'bdc2324-data/6/6pricing_formulas.csv', 'bdc2324-data/6/6product_packs.csv', 'bdc2324-data/6/6products.csv', 'bdc2324-data/6/6products_groups.csv', 'bdc2324-data/6/6purchases.csv', 'bdc2324-data/6/6representation_category_capacities.csv', 'bdc2324-data/6/6representations.csv', 'bdc2324-data/6/6seasons.csv', 'bdc2324-data/6/6structure_tag_mappings.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/6/6tags.csv', 'bdc2324-data/6/6target_types.csv', 'bdc2324-data/6/6targets.csv', 'bdc2324-data/6/6tickets.csv', 'bdc2324-data/6/6type_of_pricing_formulas.csv', 'bdc2324-data/6/6type_ofs.csv']\n",
"bdc2324-data/7\n",
"['bdc2324-data/7/7campaign_stats.csv', 'bdc2324-data/7/7campaigns.csv', 'bdc2324-data/7/7categories.csv', 'bdc2324-data/7/7consumptions.csv', 'bdc2324-data/7/7countries.csv', 'bdc2324-data/7/7currencies.csv', 'bdc2324-data/7/7customer_target_mappings.csv', 'bdc2324-data/7/7customersplus.csv', 'bdc2324-data/7/7event_types.csv', 'bdc2324-data/7/7events.csv', 'bdc2324-data/7/7facilities.csv', 'bdc2324-data/7/7link_stats.csv', 'bdc2324-data/7/7pricing_formulas.csv', 'bdc2324-data/7/7product_packs.csv', 'bdc2324-data/7/7products.csv', 'bdc2324-data/7/7products_groups.csv', 'bdc2324-data/7/7purchases.csv', 'bdc2324-data/7/7representation_category_capacities.csv', 'bdc2324-data/7/7representation_types.csv', 'bdc2324-data/7/7representations.csv', 'bdc2324-data/7/7seasons.csv', 'bdc2324-data/7/7structure_tag_mappings.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/7/7tags.csv', 'bdc2324-data/7/7target_types.csv', 'bdc2324-data/7/7targets.csv', 'bdc2324-data/7/7tickets.csv', 'bdc2324-data/7/7type_of_categories.csv', 'bdc2324-data/7/7type_of_pricing_formulas.csv', 'bdc2324-data/7/7type_ofs.csv']\n",
"bdc2324-data/8\n",
"['bdc2324-data/8/8campaign_stats.csv', 'bdc2324-data/8/8campaigns.csv', 'bdc2324-data/8/8categories.csv', 'bdc2324-data/8/8countries.csv', 'bdc2324-data/8/8currencies.csv', 'bdc2324-data/8/8customer_target_mappings.csv', 'bdc2324-data/8/8customersplus.csv', 'bdc2324-data/8/8event_types.csv', 'bdc2324-data/8/8events.csv', 'bdc2324-data/8/8facilities.csv', 'bdc2324-data/8/8link_stats.csv', 'bdc2324-data/8/8pricing_formulas.csv', 'bdc2324-data/8/8product_packs.csv', 'bdc2324-data/8/8products.csv', 'bdc2324-data/8/8products_groups.csv', 'bdc2324-data/8/8purchases.csv', 'bdc2324-data/8/8representation_category_capacities.csv', 'bdc2324-data/8/8representations.csv', 'bdc2324-data/8/8seasons.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/8/8target_types.csv', 'bdc2324-data/8/8targets.csv', 'bdc2324-data/8/8tickets.csv', 'bdc2324-data/8/8type_of_categories.csv', 'bdc2324-data/8/8type_of_pricing_formulas.csv', 'bdc2324-data/8/8type_ofs.csv']\n",
"bdc2324-data/9\n",
"['bdc2324-data/9/9campaign_stats.csv', 'bdc2324-data/9/9campaigns.csv', 'bdc2324-data/9/9categories.csv', 'bdc2324-data/9/9countries.csv', 'bdc2324-data/9/9currencies.csv', 'bdc2324-data/9/9customer_target_mappings.csv', 'bdc2324-data/9/9customersplus.csv', 'bdc2324-data/9/9event_types.csv', 'bdc2324-data/9/9events.csv', 'bdc2324-data/9/9facilities.csv', 'bdc2324-data/9/9link_stats.csv', 'bdc2324-data/9/9pricing_formulas.csv', 'bdc2324-data/9/9product_packs.csv', 'bdc2324-data/9/9products.csv', 'bdc2324-data/9/9products_groups.csv', 'bdc2324-data/9/9purchases.csv', 'bdc2324-data/9/9representation_category_capacities.csv', 'bdc2324-data/9/9representations.csv', 'bdc2324-data/9/9seasons.csv', 'bdc2324-data/9/9suppliers.csv', 'bdc2324-data/9/9target_types.csv', 'bdc2324-data/9/9targets.csv', 'bdc2324-data/9/9tickets.csv']\n",
"bdc2324-data/10\n",
"['bdc2324-data/10/10campaign_stats.csv', 'bdc2324-data/10/10campaigns.csv', 'bdc2324-data/10/10categories.csv', 'bdc2324-data/10/10countries.csv', 'bdc2324-data/10/10currencies.csv', 'bdc2324-data/10/10customer_target_mappings.csv', 'bdc2324-data/10/10customersplus.csv', 'bdc2324-data/10/10event_types.csv', 'bdc2324-data/10/10events.csv', 'bdc2324-data/10/10facilities.csv', 'bdc2324-data/10/10link_stats.csv', 'bdc2324-data/10/10pricing_formulas.csv', 'bdc2324-data/10/10product_packs.csv', 'bdc2324-data/10/10products.csv', 'bdc2324-data/10/10products_groups.csv', 'bdc2324-data/10/10purchases.csv', 'bdc2324-data/10/10representation_category_capacities.csv', 'bdc2324-data/10/10representation_types.csv', 'bdc2324-data/10/10representations.csv', 'bdc2324-data/10/10seasons.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/10/10tags.csv', 'bdc2324-data/10/10target_types.csv', 'bdc2324-data/10/10targets.csv', 'bdc2324-data/10/10tickets.csv', 'bdc2324-data/10/10type_of_pricing_formulas.csv', 'bdc2324-data/10/10type_ofs.csv']\n",
"bdc2324-data/11\n",
"['bdc2324-data/11/11campaign_stats.csv', 'bdc2324-data/11/11campaigns.csv', 'bdc2324-data/11/11categories.csv', 'bdc2324-data/11/11countries.csv', 'bdc2324-data/11/11currencies.csv', 'bdc2324-data/11/11customer_target_mappings.csv', 'bdc2324-data/11/11customersplus.csv', 'bdc2324-data/11/11event_types.csv', 'bdc2324-data/11/11events.csv', 'bdc2324-data/11/11facilities.csv', 'bdc2324-data/11/11link_stats.csv', 'bdc2324-data/11/11pricing_formulas.csv', 'bdc2324-data/11/11product_packs.csv', 'bdc2324-data/11/11products.csv', 'bdc2324-data/11/11products_groups.csv', 'bdc2324-data/11/11purchases.csv', 'bdc2324-data/11/11representation_category_capacities.csv', 'bdc2324-data/11/11representations.csv', 'bdc2324-data/11/11seasons.csv', 'bdc2324-data/11/11structure_tag_mappings.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/11/11tags.csv', 'bdc2324-data/11/11target_types.csv', 'bdc2324-data/11/11targets.csv', 'bdc2324-data/11/11tickets.csv']\n",
"bdc2324-data/12\n",
"['bdc2324-data/12/12campaign_stats.csv', 'bdc2324-data/12/12campaigns.csv', 'bdc2324-data/12/12categories.csv', 'bdc2324-data/12/12consumptions.csv', 'bdc2324-data/12/12countries.csv', 'bdc2324-data/12/12currencies.csv', 'bdc2324-data/12/12customer_target_mappings.csv', 'bdc2324-data/12/12customersplus.csv', 'bdc2324-data/12/12event_types.csv', 'bdc2324-data/12/12events.csv', 'bdc2324-data/12/12facilities.csv', 'bdc2324-data/12/12link_stats.csv', 'bdc2324-data/12/12pricing_formulas.csv', 'bdc2324-data/12/12product_packs.csv', 'bdc2324-data/12/12products.csv', 'bdc2324-data/12/12products_groups.csv', 'bdc2324-data/12/12purchases.csv', 'bdc2324-data/12/12representation_category_capacities.csv', 'bdc2324-data/12/12representations.csv', 'bdc2324-data/12/12seasons.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/12/12target_types.csv', 'bdc2324-data/12/12targets.csv', 'bdc2324-data/12/12tickets.csv', 'bdc2324-data/12/12type_ofs.csv']\n",
"bdc2324-data/13\n",
"['bdc2324-data/13/13campaign_stats.csv', 'bdc2324-data/13/13campaigns.csv', 'bdc2324-data/13/13categories.csv', 'bdc2324-data/13/13countries.csv', 'bdc2324-data/13/13currencies.csv', 'bdc2324-data/13/13customer_target_mappings.csv', 'bdc2324-data/13/13customersplus.csv', 'bdc2324-data/13/13event_types.csv', 'bdc2324-data/13/13events.csv', 'bdc2324-data/13/13facilities.csv', 'bdc2324-data/13/13link_stats.csv', 'bdc2324-data/13/13pricing_formulas.csv', 'bdc2324-data/13/13product_packs.csv', 'bdc2324-data/13/13products.csv', 'bdc2324-data/13/13products_groups.csv', 'bdc2324-data/13/13purchases.csv', 'bdc2324-data/13/13representation_category_capacities.csv', 'bdc2324-data/13/13representation_types.csv', 'bdc2324-data/13/13representations.csv', 'bdc2324-data/13/13seasons.csv', 'bdc2324-data/13/13structure_tag_mappings.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/13/13tags.csv', 'bdc2324-data/13/13target_types.csv', 'bdc2324-data/13/13targets.csv', 'bdc2324-data/13/13tickets.csv']\n",
"bdc2324-data/14\n",
"['bdc2324-data/14/14campaign_stats.csv', 'bdc2324-data/14/14campaigns.csv', 'bdc2324-data/14/14categories.csv', 'bdc2324-data/14/14countries.csv', 'bdc2324-data/14/14currencies.csv', 'bdc2324-data/14/14customer_target_mappings.csv', 'bdc2324-data/14/14customersplus.csv', 'bdc2324-data/14/14event_types.csv', 'bdc2324-data/14/14events.csv', 'bdc2324-data/14/14facilities.csv', 'bdc2324-data/14/14link_stats.csv', 'bdc2324-data/14/14pricing_formulas.csv', 'bdc2324-data/14/14product_packs.csv', 'bdc2324-data/14/14products.csv', 'bdc2324-data/14/14products_groups.csv', 'bdc2324-data/14/14purchases.csv', 'bdc2324-data/14/14representation_category_capacities.csv', 'bdc2324-data/14/14representation_types.csv', 'bdc2324-data/14/14representations.csv', 'bdc2324-data/14/14seasons.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/14/14target_types.csv', 'bdc2324-data/14/14targets.csv', 'bdc2324-data/14/14tickets.csv', 'bdc2324-data/14/14type_of_categories.csv', 'bdc2324-data/14/14type_of_pricing_formulas.csv', 'bdc2324-data/14/14type_ofs.csv']\n",
"bdc2324-data/101\n",
"['bdc2324-data/101/101campaign_stats.csv', 'bdc2324-data/101/101campaigns.csv', 'bdc2324-data/101/101categories.csv', 'bdc2324-data/101/101contribution_sites.csv', 'bdc2324-data/101/101contributions.csv', 'bdc2324-data/101/101countries.csv', 'bdc2324-data/101/101currencies.csv', 'bdc2324-data/101/101customer_target_mappings.csv', 'bdc2324-data/101/101customersplus.csv', 'bdc2324-data/101/101event_types.csv', 'bdc2324-data/101/101events.csv', 'bdc2324-data/101/101facilities.csv', 'bdc2324-data/101/101link_stats.csv', 'bdc2324-data/101/101pricing_formulas.csv', 'bdc2324-data/101/101product_packs.csv', 'bdc2324-data/101/101products.csv', 'bdc2324-data/101/101products_groups.csv', 'bdc2324-data/101/101purchases.csv', 'bdc2324-data/101/101representation_category_capacities.csv', 'bdc2324-data/101/101representations.csv', 'bdc2324-data/101/101seasons.csv', 'bdc2324-data/101/101structure_tag_mappings.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/101/101tags.csv', 'bdc2324-data/101/101target_types.csv', 'bdc2324-data/101/101targets.csv', 'bdc2324-data/101/101tickets.csv', 'bdc2324-data/101/101tickets_1.csv', 'bdc2324-data/101/101type_of_pricing_formulas.csv', 'bdc2324-data/101/101type_ofs.csv']\n"
]
}
],
"source": [
"# List the datasets stored in each folder of the bucket\n",
"for i in range(1, 15):\n",
"    FILE_PATH_S3 = f\"{BUCKET}/{i}\"\n",
"    print(FILE_PATH_S3, fs.ls(FILE_PATH_S3), sep=\"\\n\")\n",
"# Folder 101 lies outside the 1-14 range and is listed separately\n",
"print(f\"{BUCKET}/101\", fs.ls(f\"{BUCKET}/101\"), sep=\"\\n\")"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": 4,
2023-12-24 15:29:39 +01:00
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
"metadata": {},
2023-12-24 20:00:28 +01:00
"outputs": [],
"source": [
"def load_campaign_stats(directory):\n",
"    \"\"\"Read the campaign_stats.csv file of one bucket folder.\n",
"\n",
"    directory: folder number inside BUCKET (e.g. 1, 2, 3); the file name is\n",
"        prefixed with the same number, e.g. 'bdc2324-data/1/1campaign_stats.csv'.\n",
"    Returns the file content as a pandas DataFrame.\n",
"    \"\"\"\n",
"    path = f\"{BUCKET}/{directory}/{directory}campaign_stats.csv\"\n",
"    with fs.open(path, mode=\"rb\") as file_in:\n",
"        return pd.read_csv(file_in, sep=\",\")\n",
"\n",
"\n",
"# Load the campaign_stats.csv files of the first three folders\n",
"campaign_stats_1 = load_campaign_stats(1)\n",
"campaign_stats_2 = load_campaign_stats(2)\n",
"campaign_stats_3 = load_campaign_stats(3)"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": 5,
2023-12-24 20:00:28 +01:00
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
"metadata": {},
"outputs": [],
"source": [
"# Parse the 'sent_at' column of each frame as UTC datetimes (ISO 8601 input)\n",
"for stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
"    stats['sent_at'] = pd.to_datetime(stats['sent_at'], format='ISO8601', utc=True)"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": 6,
2023-12-24 20:00:28 +01:00
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
"metadata": {},
2023-12-24 15:29:39 +01:00
"outputs": [
{
2023-12-24 20:00:28 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-09 18:10:45+00:00\n",
"2020-06-02 08:24:08+00:00\n",
"2023-10-12 01:39:48+00:00\n",
"2023-10-10 17:06:29+00:00\n",
"2023-11-01 09:20:48+00:00\n",
"2021-03-31 14:59:02+00:00\n"
2023-12-24 15:29:39 +01:00
]
}
],
"source": [
2023-12-24 20:00:28 +01:00
"# Does each folder cover its own time period? Answer: no, the folders only serve to keep file sizes down\n",
"# For every frame, print the latest then the earliest 'sent_at' value\n",
"for stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
"    print(stats['sent_at'].max())\n",
"    print(stats['sent_at'].min())"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": 7,
2023-12-24 20:00:28 +01:00
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2024-01-01 13:31:02 +01:00
"0 2021-03-28 16:01:09+00:00\n",
"1 2021-03-28 16:01:09+00:00\n",
"2 2021-03-28 16:00:59+00:00\n",
"3 2021-03-28 16:00:59+00:00\n",
"4 2021-03-28 16:01:06+00:00\n",
" ... \n",
"6214803 2023-10-23 09:32:33+00:00\n",
"6214804 2023-10-23 09:32:49+00:00\n",
"6214805 2023-10-23 09:33:28+00:00\n",
"6214806 2023-10-23 09:31:53+00:00\n",
"6214807 2023-10-23 09:33:54+00:00\n",
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
2023-12-24 20:00:28 +01:00
]
},
2024-01-01 13:31:02 +01:00
"execution_count": 7,
2023-12-24 20:00:28 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": 8,
2023-12-24 20:00:28 +01:00
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-01-01 13:31:02 +01:00
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2023-12-24 20:00:28 +01:00
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"def load_customersplus(directory):\n",
"    \"\"\"Read the customersplus.csv file of one bucket folder.\n",
"\n",
"    directory: folder number inside BUCKET (e.g. 1, 2); the file name is\n",
"        prefixed with the same number, e.g. 'bdc2324-data/1/1customersplus.csv'.\n",
"    Returns the file content as a pandas DataFrame.\n",
"    \"\"\"\n",
"    path = f\"{BUCKET}/{directory}/{directory}customersplus.csv\"\n",
"    with fs.open(path, mode=\"rb\") as file_in:\n",
"        # low_memory=False parses the file in one pass so each column gets a\n",
"        # single inferred dtype; chunked inference raised a DtypeWarning\n",
"        # (mixed types in column 20) on the folder-2 file.\n",
"        return pd.read_csv(file_in, sep=\",\", low_memory=False)\n",
"\n",
"\n",
"customers_plus_1 = load_customersplus(1)\n",
"customers_plus_2 = load_customersplus(2)"
2023-12-24 15:29:39 +01:00
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": 10,
2023-12-24 20:00:28 +01:00
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')"
]
},
2024-01-01 13:31:02 +01:00
"execution_count": 10,
2023-12-24 20:00:28 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-01-01 13:31:02 +01:00
"customers_plus_1.columns"
2023-12-24 20:00:28 +01:00
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": null,
2023-12-24 20:00:28 +01:00
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
"metadata": {},
2024-01-01 13:31:02 +01:00
"outputs": [],
2023-12-24 20:00:28 +01:00
"source": [
2024-01-01 13:31:02 +01:00
"customers_plus_1.shape"
2023-12-24 20:00:28 +01:00
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": null,
2023-12-24 20:00:28 +01:00
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
"metadata": {},
2024-01-01 13:31:02 +01:00
"outputs": [],
2023-12-24 20:00:28 +01:00
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": null,
2023-12-24 20:00:28 +01:00
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
"metadata": {},
2024-01-01 13:31:02 +01:00
"outputs": [],
2023-12-24 20:00:28 +01:00
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
2024-01-01 13:31:02 +01:00
"execution_count": null,
2023-12-24 20:00:28 +01:00
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
2023-12-24 15:29:39 +01:00
"metadata": {},
"outputs": [],
2023-12-24 20:00:28 +01:00
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ids that appear in both customersplus extracts\n",
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"\n",
"# Example of a shared id: print its rows from both extracts to check whether\n",
"# the same id carries the same attributes in each of them.\n",
"if common_id:\n",
"    # Pick the example once so both prints are guaranteed to use the same id\n",
"    example_id = next(iter(common_id))\n",
"    print(customers_plus_2[customers_plus_2['id'] == example_id])\n",
"\n",
"    print(customers_plus_1[customers_plus_1['id'] == example_id])\n",
"else:\n",
"    # An empty intersection previously crashed with IndexError on list(common_id)[0]\n",
"    print(\"No id shared between customers_plus_1 and customers_plus_2\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
2024-01-02 22:45:25 +01:00
"metadata": {
"scrolled": true
},
2023-12-24 20:00:28 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.000000\n",
"lastname 43.461341\n",
"firstname 44.995588\n",
"birthdate 96.419870\n",
"email 8.622075\n",
"street_id 0.000000\n",
"created_at 0.000000\n",
"updated_at 0.000000\n",
"civility 100.000000\n",
"is_partner 0.000000\n",
"extra 100.000000\n",
"deleted_at 100.000000\n",
"reference 100.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"extra_field 100.000000\n",
"identifier 0.000000\n",
"opt_in 0.000000\n",
"structure_id 88.072380\n",
"note 99.403421\n",
"profession 95.913503\n",
"language 99.280945\n",
"mcp_contact_id 34.876141\n",
"need_reload 0.000000\n",
"last_buying_date 51.653431\n",
"max_price 51.653431\n",
"ticket_sum 0.000000\n",
"average_price 8.639195\n",
"fidelity 0.000000\n",
"average_purchase_delay 51.653431\n",
"average_price_basket 51.653431\n",
"average_ticket_basket 51.653431\n",
"total_price 43.014236\n",
"preferred_category 100.000000\n",
"preferred_supplier 100.000000\n",
"preferred_formula 100.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 51.653431\n",
"last_visiting_date 100.000000\n",
"zipcode 71.176564\n",
"country 5.459418\n",
"age 96.419870\n",
"tenant_id 0.000000\n",
"dtype: float64\n"
]
}
],
"source": [
2024-01-01 13:31:02 +01:00
"# Share of missing values per column of customers_plus_1, in percent\n",
"pd.DataFrame(100 * customers_plus_1.isna().mean())"
2023-12-24 20:00:28 +01:00
]
2023-12-22 09:58:14 +01:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}