{ "cells": [
  { "cell_type": "markdown", "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", "metadata": {}, "source": [
    "# Business Data Challenge - Team 1"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", "metadata": {}, "outputs": [], "source": [
    "# All imports live in this single cell so a fresh kernel can run the notebook top to bottom\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import s3fs"
  ] },
  { "cell_type": "markdown", "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", "metadata": {}, "source": [
    "Configuration de l'accès aux données"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", "metadata": {}, "outputs": [], "source": [
    "# Create the filesystem object; the endpoint host is read from the AWS_S3_ENDPOINT environment variable\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
    "\n",
    "BUCKET = \"bdc2324-data\"\n",
    "fs.ls(BUCKET)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", "metadata": {}, "outputs": [], "source": [
    "# Load the campaign_stats.csv file of directories 1, 2 and 3\n",
    "def read_s3_csv(path):\n",
    "    \"\"\"Open `path` through the S3 filesystem `fs` and parse it as a comma-separated file.\n",
    "\n",
    "    Returns the parsed pandas DataFrame.\n",
    "    \"\"\"\n",
    "    with fs.open(path, mode=\"rb\") as file_in:\n",
    "        return pd.read_csv(file_in, sep=\",\")\n",
    "\n",
    "\n",
    "campaign_stats_1 = read_s3_csv(f\"{BUCKET}/1/1campaign_stats.csv\")\n",
    "campaign_stats_2 = read_s3_csv(f\"{BUCKET}/2/2campaign_stats.csv\")\n",
    "campaign_stats_3 = read_s3_csv(f\"{BUCKET}/3/3campaign_stats.csv\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", "metadata": {}, "outputs": [], "source": [
    "# Parse the 'sent_at' column as ISO 8601 timestamps converted to UTC\n",
    "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
    "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 
'ISO8601', utc = True)\n",
    "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", "metadata": {}, "outputs": [], "source": [
    "# Does each unit correspond to its own period? --> No, the directories only exist to keep the files small\n",
    "# For each of the three frames, print the latest then the earliest 'sent_at'\n",
    "for stats in (campaign_stats_1, campaign_stats_2, campaign_stats_3):\n",
    "    print(stats['sent_at'].max())\n",
    "    print(stats['sent_at'].min())"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", "metadata": {}, "outputs": [], "source": [
    "campaign_stats_1['sent_at']"
  ] },
  { "cell_type": "markdown", "id": "31f2edbf-5661-4516-9835-06d4da615c13", "metadata": {}, "source": [
    "### Customersplus.csv"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", "metadata": {}, "outputs": [], "source": [
    "# Load customersplus.csv for directories 1 and 2 (read_csv splits on ',' by default)\n",
    "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    customers_plus_1 = pd.read_csv(handle)\n",
    "\n",
    "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    customers_plus_2 = pd.read_csv(handle)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "460f853a-68c0-42a7-9877-b83d3aaec813", "metadata": {}, "outputs": [], "source": [
    "customers_plus_1.columns"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", "metadata": {}, "outputs": [], "source": [
    "customers_plus_1.shape"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", "metadata": {}, "outputs": [], "source": [
    "customers_plus_1['id'].nunique()"
  ] 
},
  { "cell_type": "code", "execution_count": null, "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", "metadata": {}, "outputs": [], "source": [
    "customers_plus_2['id'].nunique()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "b40a653e-013f-48d0-8b57-0284587b36c5", "metadata": {}, "outputs": [], "source": [
    "# Ids present in both customersplus files\n",
    "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "32fa2215-3c79-40b5-8643-755865959fc7", "metadata": {}, "outputs": [], "source": [
    "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
    "# Example with one shared id: same id = same characteristics in both files\n",
    "if common_id:\n",
    "    # First element in the set's iteration order (what list(common_id)[0] returned), picked once\n",
    "    example_id = next(iter(common_id))\n",
    "    print(customers_plus_2[customers_plus_2['id'] == example_id])\n",
    "    print(customers_plus_1[customers_plus_1['id'] == example_id])\n",
    "else:\n",
    "    # Without this guard an empty intersection raised IndexError\n",
    "    print(\"No customer id is shared between the two files\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", "metadata": { "scrolled": true }, "outputs": [], "source": [
    "# Percentage of missing values per column\n",
    "customers_plus_1.isna().mean()*100"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "6f6ce60d-0912-497d-9108-330acccef394", "metadata": {}, "outputs": [], "source": [
    "# Load all the tables of directory 11; each DataFrame is bound to a global variable named after its table\n",
    "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
    "\n",
    "for nom_base in liste_base:\n",
    "    FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
    "    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "        globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e", "metadata": { "scrolled": true }, "outputs": [], "source": [
    "# Join\n",
    "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
    "merge_2 = pd.merge(products, merge_1, left_on='id', 
right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
    "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
    "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
    "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
    "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
    "df_customer_event"
  ] },
  { "cell_type": "markdown", "id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f", "metadata": {}, "source": [
    "# Fusion et exploration"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "22bfad2b-d52a-4077-9b39-bee35004e01c", "metadata": {}, "outputs": [], "source": [
    "# Join. var_choosed is the running list of columns kept after each merge: the join key of a\n",
    "# step is added before that merge and removed once the next merge no longer needs it.\n",
    "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
    "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
    "\n",
    "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
    "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
    "\n",
    "var_choosed.remove('representation_id')\n",
    "var_choosed.extend(['start_date_time', 'event_id'])\n",
    "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
    "\n",
    "var_choosed.remove('event_id')\n",
    "# 'customer_id' has been in var_choosed since the first step: adding it again selected the column\n",
    "# twice, and merging on a duplicated 'customer_id' label then fails. Only 'name' is new here.\n",
    "var_choosed.append('name')\n",
    "merge_4 = pd.merge(events, merge_3, 
left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
    "\n",
    "# Rename the event 'name' column and mirror the new name in the column list\n",
    "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
    "var_choosed[var_choosed.index('name')] = \"event_name\"\n",
    "\n",
    "# Final table: add the customer attributes taken from customersplus\n",
    "var_choosed += ['age', 'gender', 'country', 'fidelity', 'profession']\n",
    "df_customer_event = customersplus.merge(merge_4, left_on='id', right_on='customer_id', how='inner')[var_choosed]\n",
    "df_customer_event"
  ] },
  { "cell_type": "markdown", "id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31", "metadata": {}, "source": [
    "## Type de client au globale"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1", "metadata": {}, "outputs": [], "source": [
    "# Customer\n",
    "print(customer_target_mappings.columns, customer_target_mappings.shape, sep=\"\\n\")\n",
    "customer_target_mappings.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f", "metadata": {}, "outputs": [], "source": [
    "customer_target_mappings['extra_field'].unique()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec", "metadata": {}, "outputs": [], "source": [
    "customer_target_mappings['name'].unique()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4", "metadata": {}, "outputs": [], "source": [
    "# Existing segmentation\n",
    "print(target_types.columns, target_types.shape, sep=\"\\n\")\n",
    "target_types.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "5adb1773-648d-4683-bc08-d1f2298c1283", "metadata": { "scrolled": true }, "outputs": [], "source": [
    "target_types"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "3d65f74e-47fc-4296-b493-a1ebefb91cde", "metadata": {}, "outputs": [], "source": [
    "# Tags = customers\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "    tags = 
pd.read_csv(file_in, sep=\",\")\n",
    "\n",
    "print(tags.columns, tags.shape, sep=\"\\n\")\n",
    "tags.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "8a689a63-165b-4c4e-bbb0-695b661048d9", "metadata": {}, "outputs": [], "source": [
    "tags"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "69e38c52-0570-4531-aebb-9deb6db8c40b", "metadata": {}, "outputs": [], "source": [
    "# Structure = customers\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    structure_tag_mappings = pd.read_csv(handle)\n",
    "\n",
    "print(structure_tag_mappings.columns, structure_tag_mappings.shape, sep=\"\\n\")\n",
    "structure_tag_mappings.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "74dc34ad-375b-48df-a900-40d92c5fff13", "metadata": {}, "outputs": [], "source": [
    "structure_tag_mappings"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe", "metadata": {}, "outputs": [], "source": [
    "# Customers = customersplus.csv\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    customersplus = pd.read_csv(handle)\n",
    "\n",
    "print(customersplus.columns, customersplus.shape, sep=\"\\n\")\n",
    "customersplus.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "383e892c-606a-45ce-bdd6-b503b3e0be33", "metadata": {}, "outputs": [], "source": [
    "customersplus"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "70324d06-b855-4386-a7de-eef1eb13dfdf", "metadata": {}, "outputs": [], "source": [
    "# Goal: link the socio-demographic characteristics to the purchasing behaviour\n"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c", "metadata": {}, "outputs": [], "source": [
    "# tickets\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
    "    tickets = 
pd.read_csv(file_in, sep=\",\")\n",
    "\n",
    "print(tickets.columns, tickets.shape, sep=\"\\n\")\n",
    "tickets.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "ea83ea5c-3d47-4a66-a523-04b69b149a20", "metadata": {}, "outputs": [], "source": [
    "tickets"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6", "metadata": {}, "outputs": [], "source": [
    "tickets['type_of'].unique()"
  ] },
  { "cell_type": "markdown", "id": "bc192b08-30a5-486a-8bea-93e765dbfce6", "metadata": {}, "source": [
    "## Types d'évenement et client"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "e14dcf62-2def-4ed5-834b-cf21abbc2894", "metadata": {}, "outputs": [], "source": [
    "# Events = events.csv\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    events = pd.read_csv(handle)\n",
    "\n",
    "print(events.columns, events.shape, sep=\"\\n\")\n",
    "events.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316", "metadata": {}, "outputs": [], "source": [
    "events"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "af80eee8-f717-4159-a0fd-09d47ec96621", "metadata": {}, "outputs": [], "source": [
    "events['name'].nunique()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2", "metadata": {}, "outputs": [], "source": [
    "# Representations of the events = representations.csv\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    representations = pd.read_csv(handle)\n",
    "\n",
    "print(representations.columns, representations.shape, sep=\"\\n\")\n",
    "representations.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "1487402a-a49b-4737-b7d7-40c764d2f0b4", "metadata": {}, "outputs": [], "source": [
    "representations"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "99b27418-2c15-4a6e-bcf5-d329ca492085", "metadata": {}, "outputs": [], "source": [
    "# Products sold = products.csv\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    products = pd.read_csv(handle)\n",
    "\n",
    "print(products.columns, products.shape, sep=\"\\n\")\n",
    "products.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "c49bcd47-672f-4e0f-aee9-a7475151b97f", "metadata": {}, "outputs": [], "source": [
    "products"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "a4aec5ce-d0c9-4625-bb29-9ac154818621", "metadata": {}, "outputs": [], "source": [
    "# Venue = facilities.csv\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    facilities = pd.read_csv(handle)\n",
    "\n",
    "print(facilities.columns, facilities.shape, sep=\"\\n\")\n",
    "facilities.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "b3642483-2879-442a-ad69-efcd2331a200", "metadata": {}, "outputs": [], "source": [
    "facilities"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "da1e9807-2a8d-4be7-a785-55cffd734f36", "metadata": {}, "outputs": [], "source": [
    "# Seasons = seasons.csv, a period spanning two consecutive years\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, \"rb\") as handle:\n",
    "    seasons = pd.read_csv(handle)\n",
    "\n",
    "print(seasons.columns, seasons.shape, sep=\"\\n\")\n",
    "seasons.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9", "metadata": {}, "outputs": [], "source": [
    "seasons['name'].unique()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "abb3aa20-774b-4761-983a-df5eb2bc51c6", "metadata": {}, "outputs": [], "source": [
    "# Purchases = purchases.csv\n",
    "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
    "\n",
    "with fs.open(FILE_PATH_S3, mode=\"rb\") 
as file_in:\n",
    "    # read_csv splits on ',' by default\n",
    "    purchases = pd.read_csv(file_in)\n",
    "\n",
    "print(purchases.columns, purchases.shape, sep=\"\\n\")\n",
    "purchases.info()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "30e204ab-4f63-430c-a818-5c8035b6e17b", "metadata": {}, "outputs": [], "source": [
    "purchases"
  ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}