{ "cells": [ { "cell_type": "markdown", "id": "56b3d44e-1e3f-4726-9916-0f9af107860e", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "15103481-8d74-404c-aa09-7601fe7730da", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re" ] }, { "cell_type": "markdown", "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "markdown", "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c", "metadata": {}, "source": [ "# Exemple sur Company 1" ] }, { "cell_type": "markdown", "id": "9d74b68f-ba07-4a15-9a27-dae931762d70", "metadata": {}, "source": [ "## Chargement données" ] }, { "cell_type": "code", "execution_count": 3, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data/1\"\n", "liste_database = fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 28, "id": "0cb92854-903b-4efd-ac1b-197e29f044b4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n" ] } ], 
"source": [ "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n", "\n", "# Keep only the bucket paths that contain at least one of the selected table names\n", "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", "\n", "# Show the result\n", "print(liste_database_filtered)" ] }, { "cell_type": "code", "execution_count": 29, "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } ], "source": [ "# Load each selected CSV into a DataFrame exposed as df<client>_<table> (e.g. df1_suppliers)\n", "files_path = liste_database_filtered\n", "\n", "if not files_path:\n", "    raise ValueError(\"liste_database_filtered is empty: no file matched liste_database_select\")\n", "\n", "client_number = files_path[0].split(\"/\")[1]\n", "df_prefix = \"df\" + str(client_number) + \"_\"\n", "# Expected path shape: <bucket>/<client>/<client><table>.csv ; group(3) is the table name\n", "table_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n", "\n", "for current_path in files_path:\n", "    match = table_pattern.search(current_path)\n", "    if match is None:\n", "        # Report and skip instead of failing on None.group()\n", "        print(\"Skipped, unexpected file name: \" + current_path)\n", "        continue\n", "    with fs.open(current_path, mode=\"rb\") as file_in:\n", "        # low_memory=False avoids chunked type inference, so a mixed-type column gets a single dtype (no DtypeWarning)\n", "        df = pd.read_csv(file_in, low_memory=False)\n", "    nom_dataframe = df_prefix + match.group(3)\n", "    # The cells below read these frames as globals (df1_suppliers, ...)\n", "    globals()[nom_dataframe] = df" ] }, { "cell_type": "markdown", "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", "metadata": {}, "source": [ "## suppliers.csv" ] }, { "cell_type": "code", "execution_count": 18, "id": "2e0dada0-9457-484c-aa55-77e44613ecca", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamemanually_addedlabelitrupdated_atcreated_atcommissionidentifier
01617j4 administrationFalseNaNNaN2021-07-29 09:21:37.325772+02:002021-07-29 09:21:37.325772+02:00NaN5958b2a060ac3e31678b438892a1bd2e
18non définiFalseNaNNaN2020-09-03 13:16:35.329062+02:002020-09-03 13:16:35.329062+02:00NaN52ff3466787b4d538407372e5f7afe0f
24vadFalseNaNNaN2020-09-03 13:11:23.896992+02:002020-09-03 13:11:23.896992+02:00NaN1225483c97b36018cab2bea14ab78ea6
31fort saint jeanFalseNaNNaN2020-09-03 13:11:23.833073+02:002020-09-03 13:11:23.833073+02:00NaN001b9b4a524fe407150b8235b304d4ec
42j4FalseNaNNaN2020-09-03 13:11:23.888993+02:002020-09-03 13:11:23.888993+02:00NaN6a0cf6edf20060344b465706b61719aa
55revendeurFalseNaNNaN2020-09-03 13:11:23.900987+02:002020-09-03 13:11:23.900987+02:00NaN931239d4acb6214d7e5c98edecfb4916
63vente en ligneFalseNaNNaN2020-09-03 13:11:23.893097+02:002020-09-03 13:11:23.893097+02:00NaNbde8f2ccff510df8572d3214d86b837d
76ccrFalseNaNNaN2020-09-03 13:11:23.904974+02:002020-09-03 13:11:23.904974+02:00NaNb48ec279411f7dbbb68393c61a9724d9
87dabFalseNaNNaN2020-09-03 13:11:23.908970+02:002020-09-03 13:11:23.908970+02:00NaN11c6d471fa4e354e62e684d293694202
\n", "
" ], "text/plain": [ " id name manually_added label itr \\\n", "0 1617 j4 administration False NaN NaN \n", "1 8 non défini False NaN NaN \n", "2 4 vad False NaN NaN \n", "3 1 fort saint jean False NaN NaN \n", "4 2 j4 False NaN NaN \n", "5 5 revendeur False NaN NaN \n", "6 3 vente en ligne False NaN NaN \n", "7 6 ccr False NaN NaN \n", "8 7 dab False NaN NaN \n", "\n", " updated_at created_at \\\n", "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n", "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n", "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n", "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n", "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n", "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n", "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n", "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n", "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n", "\n", " commission identifier \n", "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n", "1 NaN 52ff3466787b4d538407372e5f7afe0f \n", "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n", "3 NaN 001b9b4a524fe407150b8235b304d4ec \n", "4 NaN 6a0cf6edf20060344b465706b61719aa \n", "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n", "6 NaN bde8f2ccff510df8572d3214d86b837d \n", "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n", "8 NaN 11c6d471fa4e354e62e684d293694202 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n", "df1_suppliers" ] }, { "cell_type": "code", "execution_count": 19, "id": "b583be02-ab60-4e14-9325-0204f203a1af", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 9 entries, 0 to 8\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ 
-------------- ----- \n", " 0 id 9 non-null int64 \n", " 1 name 9 non-null object \n", " 2 manually_added 9 non-null bool \n", " 3 label 0 non-null float64\n", " 4 itr 0 non-null float64\n", " 5 updated_at 9 non-null object \n", " 6 created_at 9 non-null object \n", " 7 commission 0 non-null float64\n", " 8 identifier 9 non-null object \n", "dtypes: bool(1), float64(3), int64(1), object(4)\n", "memory usage: 713.0+ bytes\n" ] } ], "source": [ "df1_suppliers.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e", "metadata": {}, "outputs": [], "source": [ "df1_suppliers.isna().mean()*100" ] }, { "cell_type": "code", "execution_count": null, "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72", "metadata": {}, "source": [ "## purchases.csv" ] }, { "cell_type": "code", "execution_count": null, "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", "metadata": {}, "outputs": [], "source": [ "# Preview of the purchases table (only the last expression of a cell is displayed, so each table gets its own cell)\n", "df1_purchases.head()" ] }, { "cell_type": "markdown", "id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd", "metadata": {}, "source": [ "## tickets.csv" ] }, { "cell_type": "code", "execution_count": null, "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", "metadata": {}, "outputs": [], "source": [ "# Preview of the tickets table\n", "df1_tickets.head()" ] }, { "cell_type": "markdown", "id": "355f5489-7904-4161-a85b-6eb70b3a4c89", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "# Fusion et exploration" ] }, { "cell_type": "code", "execution_count": 24, "id": "afe548fe-d93c-4634-9f53-881404ec4c6c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xpurchase_datetype_ofis_from_subscriptionamountis_full_pricestart_date_timeevent_name
09924232023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
19924232023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
210539342023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
310539342023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
411891412020-11-26 13:12:53+01:003False51.3False2020-12-01 20:00:00+01:00iphigenie en tauride
...........................
31896410908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896510908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896610908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896712442772019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
31896812442772019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
\n", "

318969 rows × 8 columns

\n", "
" ], "text/plain": [ " id_x purchase_date type_of is_from_subscription \\\n", "0 992423 2023-01-11 17:08:41+01:00 3 False \n", "1 992423 2023-01-11 17:08:41+01:00 3 False \n", "2 1053934 2023-03-16 16:23:10+01:00 3 False \n", "3 1053934 2023-03-16 16:23:10+01:00 3 False \n", "4 1189141 2020-11-26 13:12:53+01:00 3 False \n", "... ... ... ... ... \n", "318964 1090839 2019-05-19 21:18:36+02:00 1 False \n", "318965 1090839 2019-05-19 21:18:36+02:00 1 False \n", "318966 1090839 2019-05-19 21:18:36+02:00 1 False \n", "318967 1244277 2019-12-31 11:04:07+01:00 1 False \n", "318968 1244277 2019-12-31 11:04:07+01:00 1 False \n", "\n", " amount is_full_price start_date_time event_name \n", "0 13.0 False 2023-02-06 20:00:00+01:00 zaide \n", "1 13.0 False 2023-02-06 20:00:00+01:00 zaide \n", "2 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n", "3 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n", "4 51.3 False 2020-12-01 20:00:00+01:00 iphigenie en tauride \n", "... ... ... ... ... \n", "318964 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", "318965 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", "318966 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", "318967 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n", "318968 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n", "\n", "[318969 rows x 8 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ],
"source": [ "# Successive inner joins: purchases -> tickets -> products -> representations -> events -> customers\n", "# NOTE(review): purchases, tickets, products, representations, events and customersplus are not defined by any cell above in this notebook; load them before running this cell\n", "purchase_cols = ['id_x', 'customer_id', 'product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n", "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[purchase_cols]\n", "\n", "product_cols = purchase_cols + ['amount', 'is_full_price', 'representation_id']\n", "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[product_cols]\n", "\n", "# representation_id is only needed as the join key of the next merge, it is not selected afterwards\n", "representation_cols = [col for col in product_cols if col != 'representation_id'] + ['start_date_time', 'event_id']\n", "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[representation_cols]\n", "\n", "# customer_id is already selected since merge_1: selecting it twice duplicates the column and the last merge then fails on a non-unique key\n", "event_cols = [col for col in representation_cols if col != 'event_id'] + ['name']\n", "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[event_cols]\n", "\n", "# Rename the event label\n", "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", "\n", "# Final table\n", "var_choosed = [col for col in event_cols if col != 'name'] + ['event_name', 'age', 'gender', 'country', 'fidelity', 'profession']\n", "df_customer_event = pd.merge(customersplus, merge_4, left_on='id', right_on='customer_id', how='inner')[var_choosed]\n", "df_customer_event" ] }, { "cell_type": "markdown", "id": "779da86b-ac61-4c61-88d2-fa1c0c19efce", "metadata": {}, "source": [ "## Type de client au global" ] }, { "cell_type": "code", "execution_count": 4, "id": "7c89d25f-ee42-4478-9ff0-ee64b781d5c8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n", " 'extra_field'],\n", " dtype='object')\n", "(124302, 7)\n", "\n", "RangeIndex: 124302 entries, 0 to 124301\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 124302 non-null int64 \n", " 1 customer_id 124302 non-null int64 \n", " 2 target_id 124302 non-null int64 \n", " 3 created_at 124296 non-null object \n", " 4 updated_at 124296 non-null object \n", " 5 name 0 non-null float64\n", " 6 extra_field 0 non-null float64\n", "dtypes: float64(2), int64(3), object(2)\n", "memory usage: 6.6+ MB\n" ] } ], "source": [ "# Client\n", "print(customer_target_mappings.columns)\n", "print(customer_target_mappings.shape)\n", "customer_target_mappings.info()" ] }, { "cell_type": "code", "execution_count": 26, "id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"array([nan])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customer_target_mappings['extra_field'].unique()" ] }, { "cell_type": "code", "execution_count": 27, "id": "47bc8453-0693-4838-8bd8-4d800a82c496", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([nan])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customer_target_mappings['name'].unique()" ] }, { "cell_type": "code", "execution_count": 11, "id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", "(4, 6)\n", "\n", "RangeIndex: 4 entries, 0 to 3\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 4 non-null int64 \n", " 1 is_import 4 non-null bool \n", " 2 name 4 non-null object\n", " 3 created_at 4 non-null object\n", " 4 updated_at 4 non-null object\n", " 5 identifier 4 non-null object\n", "dtypes: bool(1), int64(1), object(4)\n", "memory usage: 292.0+ bytes\n" ] } ], "source": [ "# Segmentation existante\n", "print(target_types.columns)\n", "print(target_types.shape)\n", "target_types.info()" ] }, { "cell_type": "code", "execution_count": 12, "id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idis_importnamecreated_atupdated_atidentifier
01Falsemanual_static_filter2021-04-29 13:42:14.111085+02:002021-04-29 13:42:14.111085+02:00fb27e81baa4debc6a4e1a8639c20e808
13Truemanual_structure2021-05-07 15:20:00.626650+02:002021-05-07 15:20:00.626650+02:00382bca214204a2d3462f5ec2728d5d1e
26Falsemanual_dynamic_filter2021-09-09 14:27:47.641302+02:002021-09-09 14:27:47.641302+02:00e0f4b8693184850fefd6d2a38f10584e
32Truemanual_import2021-04-29 13:49:30.107110+02:002021-04-29 13:49:30.107110+02:0012213df2ce68a624e4c0070521437bac
\n", "
" ], "text/plain": [ " id is_import name created_at \\\n", "0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n", "1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n", "2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n", "3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n", "\n", " updated_at identifier \n", "0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n", "1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n", "2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n", "3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_types" ] }, { "cell_type": "code", "execution_count": 17, "id": "8dd74e87-97c2-493d-b19f-971b684078d3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", "(20, 5)\n", "\n", "RangeIndex: 20 entries, 0 to 19\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 20 non-null int64 \n", " 1 name 19 non-null object\n", " 2 created_at 20 non-null object\n", " 3 updated_at 20 non-null object\n", " 4 identifier 20 non-null object\n", "dtypes: int64(1), object(4)\n", "memory usage: 928.0+ bytes\n" ] } ], "source": [ "# Tags = clients\n", "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " tags = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(tags.columns)\n", "print(tags.shape)\n", "tags.info()" ] }, { "cell_type": "code", "execution_count": 18, "id": "91d54732-666c-4250-ba91-5c9b83d4712a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atidentifier
02ens-écoles2021-05-07 15:24:19.808501+02:002021-05-07 15:24:19.808501+02:00b6a360c5f84595940c5774f13fd39cc3
11NaN2021-05-07 15:24:19.805589+02:002021-05-07 15:24:19.805589+02:00d41d8cd98f00b204e9800998ecf8427e
24ecoles primaires rennes2021-05-07 15:29:06.388415+02:002021-05-07 15:29:06.388415+02:00ca8649dd64c240d118f60b07d11a7053
35Angers Nantes Opéra2023-01-27 15:59:58.187557+01:002023-01-27 15:59:58.187557+01:00f8f500f937fe312542399299cdc13f7e
46Opéras2023-01-27 16:03:59.654938+01:002023-01-27 16:03:59.654938+01:0022eb2c616983ec7b54a093f84b230505
57Ministère de la Culture2023-01-30 11:22:29.636813+01:002023-01-30 11:22:29.636813+01:001b8c5c08fde000d90905a3d14af7763d
68Orchestres2023-01-30 11:33:56.392799+01:002023-01-30 11:33:56.392799+01:007c2aee0c80642d7e325a450f2dec45e5
79Cooperative2023-01-31 14:44:38.471146+01:002023-01-31 14:44:38.471146+01:006c88c36ffaab88d255865aa3111d7686
810Théâtres2023-01-31 14:45:17.804428+01:002023-01-31 14:45:17.804428+01:00b2c19672df82021702b79482c8cda85a
911La co[opera]tive2023-02-16 17:11:35.004478+01:002023-02-16 17:11:35.004478+01:005dbaa3a1f278c0fcf981d447ad20957a
1012Ville de Rennes2023-02-16 17:37:13.816196+01:002023-02-16 17:37:13.816196+01:00bc483d04d9c3a08f167a3ce64366ca72
1113Ensembles en résidence2023-02-16 17:55:54.877374+01:002023-02-16 17:55:54.877374+01:00e70635e771de13268dccf02bb2abfaf9
1214Ministère2023-02-17 11:17:54.429462+01:002023-02-17 11:17:54.429462+01:00a3f0582853fd19f5b57e3651f8a20e7a
1315Rennes métropole2023-02-17 11:53:24.490786+01:002023-02-17 11:53:24.490786+01:00e98b8db5941b96c29c353b6f2f502055
1416Ville de Rennes - équipements culturels2023-02-17 12:00:10.649104+01:002023-02-17 12:00:10.649104+01:00a44edffc7edb852982efa7f4aa6d0e25
1517Structures culturelles rennaises2023-02-17 12:05:55.583016+01:002023-02-17 12:05:55.583016+01:00241550517e4e3b1c926e9aeab0f621cd
1618Université Rennes 22023-02-17 14:23:44.832959+01:002023-02-17 14:23:44.832959+01:004057c5cee51c4e10aa819f0cf48adc3f
1719Centres chorégraphiques nationaux2023-02-17 15:29:41.827321+01:002023-02-17 15:29:41.827321+01:0041e75941dfb766365498d917abe0102f
1820Télévision2023-02-17 15:46:13.746092+01:002023-02-17 15:46:13.746092+01:0036d6409c539dd79c1f3af8c5948603eb
1921structures culturelles nationales2023-02-17 15:56:00.555722+01:002023-02-17 15:56:00.555722+01:005311cf7e42aac53289e1c4a338d5cfa4
\n", "
" ], "text/plain": [ " id name \\\n", "0 2 ens-écoles \n", "1 1 NaN \n", "2 4 ecoles primaires rennes \n", "3 5 Angers Nantes Opéra \n", "4 6 Opéras \n", "5 7 Ministère de la Culture \n", "6 8 Orchestres \n", "7 9 Cooperative \n", "8 10 Théâtres \n", "9 11 La co[opera]tive \n", "10 12 Ville de Rennes \n", "11 13 Ensembles en résidence \n", "12 14 Ministère \n", "13 15 Rennes métropole \n", "14 16 Ville de Rennes - équipements culturels \n", "15 17 Structures culturelles rennaises \n", "16 18 Université Rennes 2 \n", "17 19 Centres chorégraphiques nationaux \n", "18 20 Télévision \n", "19 21 structures culturelles nationales \n", "\n", " created_at updated_at \\\n", "0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n", "1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n", "2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n", "3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n", "4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n", "5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n", "6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n", "7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n", "8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n", "9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n", "10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n", "11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n", "12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n", "13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n", "14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n", "15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n", "16 2023-02-17 14:23:44.832959+01:00 2023-02-17 14:23:44.832959+01:00 \n", "17 2023-02-17 15:29:41.827321+01:00 2023-02-17 
15:29:41.827321+01:00 \n", "18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n", "19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n", "\n", " identifier \n", "0 b6a360c5f84595940c5774f13fd39cc3 \n", "1 d41d8cd98f00b204e9800998ecf8427e \n", "2 ca8649dd64c240d118f60b07d11a7053 \n", "3 f8f500f937fe312542399299cdc13f7e \n", "4 22eb2c616983ec7b54a093f84b230505 \n", "5 1b8c5c08fde000d90905a3d14af7763d \n", "6 7c2aee0c80642d7e325a450f2dec45e5 \n", "7 6c88c36ffaab88d255865aa3111d7686 \n", "8 b2c19672df82021702b79482c8cda85a \n", "9 5dbaa3a1f278c0fcf981d447ad20957a \n", "10 bc483d04d9c3a08f167a3ce64366ca72 \n", "11 e70635e771de13268dccf02bb2abfaf9 \n", "12 a3f0582853fd19f5b57e3651f8a20e7a \n", "13 e98b8db5941b96c29c353b6f2f502055 \n", "14 a44edffc7edb852982efa7f4aa6d0e25 \n", "15 241550517e4e3b1c926e9aeab0f621cd \n", "16 4057c5cee51c4e10aa819f0cf48adc3f \n", "17 41e75941dfb766365498d917abe0102f \n", "18 36d6409c539dd79c1f3af8c5948603eb \n", "19 5311cf7e42aac53289e1c4a338d5cfa4 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags" ] }, { "cell_type": "code", "execution_count": 19, "id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n", "(179, 5)\n", "\n", "RangeIndex: 179 entries, 0 to 178\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 179 non-null int64 \n", " 1 structure_id 179 non-null int64 \n", " 2 tag_id 179 non-null int64 \n", " 3 created_at 179 non-null object\n", " 4 updated_at 179 non-null object\n", "dtypes: int64(3), object(2)\n", "memory usage: 7.1+ KB\n" ] } ], "source": [ "# Structure = clients\n", "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " 
structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(structure_tag_mappings.columns)\n", "print(structure_tag_mappings.shape)\n", "structure_tag_mappings.info()" ] }, { "cell_type": "code", "execution_count": 20, "id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idstructure_idtag_idcreated_atupdated_at
012318762023-01-27 16:03:59.680222+01:002023-01-27 16:03:59.680222+01:00
12222021-05-07 15:24:19.872895+02:002021-05-07 15:24:19.872895+02:00
23322021-05-07 15:24:19.873830+02:002021-05-07 15:24:19.873830+02:00
34422021-05-07 15:24:19.874628+02:002021-05-07 15:24:19.874628+02:00
45522021-05-07 15:24:19.875421+02:002021-05-07 15:24:19.875421+02:00
..................
174184236102023-02-17 16:35:25.041114+01:002023-02-17 16:35:25.041114+01:00
175185237172023-02-17 16:39:10.799478+01:002023-02-17 16:39:10.799478+01:00
176186238192023-02-17 16:53:21.098690+01:002023-02-17 16:53:21.098690+01:00
177187239102023-02-17 16:57:42.623481+01:002023-02-17 16:57:42.623481+01:00
178188240102023-02-17 16:59:22.067723+01:002023-02-17 16:59:22.067723+01:00
\n", "

179 rows × 5 columns

\n", "
" ], "text/plain": [ " id structure_id tag_id created_at \\\n", "0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n", "1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n", "2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n", "3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n", "4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n", ".. ... ... ... ... \n", "174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n", "175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n", "176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n", "177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n", "178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n", "\n", " updated_at \n", "0 2023-01-27 16:03:59.680222+01:00 \n", "1 2021-05-07 15:24:19.872895+02:00 \n", "2 2021-05-07 15:24:19.873830+02:00 \n", "3 2021-05-07 15:24:19.874628+02:00 \n", "4 2021-05-07 15:24:19.875421+02:00 \n", ".. ... \n", "174 2023-02-17 16:35:25.041114+01:00 \n", "175 2023-02-17 16:39:10.799478+01:00 \n", "176 2023-02-17 16:53:21.098690+01:00 \n", "177 2023-02-17 16:57:42.623481+01:00 \n", "178 2023-02-17 16:59:22.067723+01:00 \n", "\n", "[179 rows x 5 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "structure_tag_mappings" ] }, { "cell_type": "code", "execution_count": 24, "id": "41bf1529-5a7c-409e-9791-2024c08c11f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", " 'average_purchase_delay', 'average_price_basket',\n", " 'average_ticket_basket', 'total_price', 'preferred_category',\n", " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", 
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", " 'tenant_id'],\n", " dtype='object')\n", "(71307, 43)\n", "\n", "RangeIndex: 71307 entries, 0 to 71306\n", "Data columns (total 43 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 71307 non-null int64 \n", " 1 lastname 41045 non-null object \n", " 2 firstname 39140 non-null object \n", " 3 birthdate 18174 non-null object \n", " 4 email 58203 non-null object \n", " 5 street_id 71307 non-null int64 \n", " 6 created_at 71307 non-null object \n", " 7 updated_at 71307 non-null object \n", " 8 civility 0 non-null float64\n", " 9 is_partner 71307 non-null bool \n", " 10 extra 0 non-null float64\n", " 11 deleted_at 0 non-null float64\n", " 12 reference 0 non-null float64\n", " 13 gender 71307 non-null int64 \n", " 14 is_email_true 71307 non-null bool \n", " 15 extra_field 0 non-null float64\n", " 16 identifier 71307 non-null object \n", " 17 opt_in 71307 non-null bool \n", " 18 structure_id 616 non-null float64\n", " 19 note 451 non-null object \n", " 20 profession 812 non-null object \n", " 21 language 0 non-null float64\n", " 22 mcp_contact_id 22417 non-null float64\n", " 23 need_reload 71307 non-null bool \n", " 24 last_buying_date 34040 non-null object \n", " 25 max_price 34040 non-null float64\n", " 26 ticket_sum 71307 non-null int64 \n", " 27 average_price 68694 non-null float64\n", " 28 fidelity 71307 non-null int64 \n", " 29 average_purchase_delay 34040 non-null float64\n", " 30 average_price_basket 34040 non-null float64\n", " 31 average_ticket_basket 34040 non-null float64\n", " 32 total_price 36653 non-null float64\n", " 33 preferred_category 0 non-null float64\n", " 34 preferred_supplier 0 non-null float64\n", " 35 preferred_formula 0 non-null float64\n", " 36 purchase_count 71307 non-null int64 \n", " 37 first_buying_date 34040 non-null object \n", " 38 last_visiting_date 0 non-null float64\n", " 39 zipcode 33756 non-null object \n", " 
40 country 39910 non-null object \n", " 41 age 18174 non-null float64\n", " 42 tenant_id 71307 non-null int64 \n", "dtypes: bool(4), float64(19), int64(7), object(13)\n", "memory usage: 21.5+ MB\n" ] } ], "source": [ "# Tags = clients\n", "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " customersplus = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(customersplus.columns)\n", "print(customersplus.shape)\n", "customersplus.info()" ] }, { "cell_type": "code", "execution_count": 25, "id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...preferred_categorypreferred_supplierpreferred_formulapurchase_countfirst_buying_datelast_visiting_datezipcodecountryagetenant_id
0286834lastname286834firstname286834NaNemail28683462022-05-19 10:09:09.361137+02:002022-05-19 10:09:09.361137+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNfrNaN1556
1330695NaNNaNNaNemail33069512022-07-16 04:10:34.135134+02:002022-07-16 04:10:34.156704+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
2330978NaNNaNNaNemail33097812022-07-21 22:14:09.811721+02:002022-07-21 22:14:09.836051+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
3338697NaNNaNNaNemail33869712022-09-15 19:02:03.950536+02:002022-09-15 19:02:03.985642+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
4338726NaNNaNNaNemail33872612022-09-16 01:24:40.719882+02:002022-09-16 01:24:40.742753+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
..................................................................
7130227105lastname27105firstname271051957-01-26email271052050242021-04-22 15:12:59.986534+02:002023-09-12 18:59:31.613235+02:00NaNFalse...NaNNaNNaN22018-12-31 18:56:57+01:00NaN35700fr66.01556
7130327108lastname27108firstname27108NaNNaN2050242021-04-22 15:12:59.989197+02:002023-09-12 18:27:34.380843+02:00NaNFalse...NaNNaNNaN62015-12-29 14:51:46+01:00NaN35700frNaN1556
7130427110lastname27110firstname27110NaNNaN62021-04-22 15:12:59.991029+02:002022-04-14 11:41:33.738500+02:00NaNFalse...NaNNaNNaN12018-12-31 19:12:59+01:00NaNNaNfrNaN1556
7130510607lastname10607firstname106071963-01-04email106073133322021-04-22 14:56:45.742226+02:002023-09-12 17:55:17.723195+02:00NaNFalse...NaNNaNNaN262015-10-10 14:11:21+02:00NaN35850fr60.01556
7130619095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...NaNNaNNaN22019-05-19 21:18:36+02:00NaNNaNfr44.01556
\n", "

71307 rows × 43 columns

\n", "
" ], "text/plain": [ " id lastname firstname birthdate email \\\n", "0 286834 lastname286834 firstname286834 NaN email286834 \n", "1 330695 NaN NaN NaN email330695 \n", "2 330978 NaN NaN NaN email330978 \n", "3 338697 NaN NaN NaN email338697 \n", "4 338726 NaN NaN NaN email338726 \n", "... ... ... ... ... ... \n", "71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n", "71303 27108 lastname27108 firstname27108 NaN NaN \n", "71304 27110 lastname27110 firstname27110 NaN NaN \n", "71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n", "71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", "\n", " street_id created_at \\\n", "0 6 2022-05-19 10:09:09.361137+02:00 \n", "1 1 2022-07-16 04:10:34.135134+02:00 \n", "2 1 2022-07-21 22:14:09.811721+02:00 \n", "3 1 2022-09-15 19:02:03.950536+02:00 \n", "4 1 2022-09-16 01:24:40.719882+02:00 \n", "... ... ... \n", "71302 205024 2021-04-22 15:12:59.986534+02:00 \n", "71303 205024 2021-04-22 15:12:59.989197+02:00 \n", "71304 6 2021-04-22 15:12:59.991029+02:00 \n", "71305 313332 2021-04-22 14:56:45.742226+02:00 \n", "71306 6 2021-04-22 15:06:30.120537+02:00 \n", "\n", " updated_at civility is_partner ... \\\n", "0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n", "1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n", "2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n", "3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n", "4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n", "... ... ... ... ... \n", "71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n", "71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n", "71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n", "71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n", "71306 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", "\n", " preferred_category preferred_supplier preferred_formula \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... 
\n", "71302 NaN NaN NaN \n", "71303 NaN NaN NaN \n", "71304 NaN NaN NaN \n", "71305 NaN NaN NaN \n", "71306 NaN NaN NaN \n", "\n", " purchase_count first_buying_date last_visiting_date zipcode \\\n", "0 0 NaN NaN NaN \n", "1 0 NaN NaN NaN \n", "2 0 NaN NaN NaN \n", "3 0 NaN NaN NaN \n", "4 0 NaN NaN NaN \n", "... ... ... ... ... \n", "71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n", "71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n", "71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n", "71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n", "71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n", "\n", " country age tenant_id \n", "0 fr NaN 1556 \n", "1 NaN NaN 1556 \n", "2 NaN NaN 1556 \n", "3 NaN NaN 1556 \n", "4 NaN NaN 1556 \n", "... ... ... ... \n", "71302 fr 66.0 1556 \n", "71303 fr NaN 1556 \n", "71304 fr NaN 1556 \n", "71305 fr 60.0 1556 \n", "71306 fr 44.0 1556 \n", "\n", "[71307 rows x 43 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customersplus" ] }, { "cell_type": "code", "execution_count": 6, "id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2", "metadata": {}, "outputs": [], "source": [ "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "8259ae6c-353f-43a6-add3-f974fac6e5d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'number', 'created_at', 'updated_at', 'purchase_id', 'product_id',\n", " 'is_from_subscription', 'type_of', 'supplier_id', 'barcode',\n", " 'identifier'],\n", " dtype='object')\n", "(318969, 11)\n", "\n", "RangeIndex: 318969 entries, 0 to 318968\n", "Data columns (total 11 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 318969 non-null int64 \n", " 1 number 318969 non-null object \n", " 2 created_at 318969 non-null object \n", " 3 updated_at 318969 non-null object \n", " 4 purchase_id 318969 non-null int64 \n", " 5 
product_id 318969 non-null int64 \n", " 6 is_from_subscription 318969 non-null bool \n", " 7 type_of 318969 non-null int64 \n", " 8 supplier_id 318969 non-null int64 \n", " 9 barcode 0 non-null float64\n", " 10 identifier 318969 non-null object \n", "dtypes: bool(1), float64(1), int64(5), object(4)\n", "memory usage: 24.6+ MB\n" ] } ], "source": [ "# tickets\n", "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " tickets = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(tickets.columns)\n", "print(tickets.shape)\n", "tickets.info()" ] }, { "cell_type": "code", "execution_count": 30, "id": "f54830cb-1f95-4f71-9b04-358c745fb454", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
021190811433_136_212_683562023-09-12 17:42:45.396336+02:002023-09-12 17:42:45.396336+02:00861764209879False11702NaNf694c255855ce5643c6fcc7fed5e9237
121190821433_136_194_683562023-09-12 17:42:45.409056+02:002023-09-12 17:42:45.409056+02:00861763209879False11702NaN838d6101db2fc8bc80536d8b91b49859
2211908333158_158_343_683572023-09-12 17:42:45.409824+02:002023-09-12 17:42:45.409824+02:00861769209880False11702NaN8a8d938d66a4dc57bcb44c2773c6fdfa
3211908433158_158_297_683572023-09-12 17:42:45.410447+02:002023-09-12 17:42:45.410447+02:00861767209880False11702NaNb7a3dd0794c0957c942d45b8913e5b96
4211908533158_158_318_683572023-09-12 17:42:45.411059+02:002023-09-12 17:42:45.411059+02:00861768209880False11702NaNd7ea7e443581ebe520dd13f6cad31af7
....................................
318964256402144247_204_239_892782023-09-12 18:59:48.750953+02:002023-09-12 18:59:48.750953+02:001244281210158False11702NaN82c9af8b2167f7ac34a5e834242b0239
318965256402244247_204_299_892782023-09-12 18:59:48.751441+02:002023-09-12 18:59:48.751441+02:001244284210158False11702NaN235e8e608f066cb72949bbd397d0a76f
318966256402344247_204_259_892782023-09-12 18:59:48.751924+02:002023-09-12 18:59:48.751924+02:001244282210158False11702NaNec22fa828931f030f7e79a4cc5478c4b
318967256402444247_204_279_892782023-09-12 18:59:48.752425+02:002023-09-12 18:59:48.752425+02:001244283210158False11702NaN31ec4deaf718e04caf193e1ff8d621ef
31896825131564854_178_2847_891702023-09-12 18:52:20.331807+02:002023-09-12 18:59:48.752904+02:001244285261922False31702NaN48aef9efab29bfb1537656908863bcc1
\n", "

318969 rows × 11 columns

\n", "
" ], "text/plain": [ " id number created_at \\\n", "0 2119081 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n", "1 2119082 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n", "2 2119083 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n", "3 2119084 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n", "4 2119085 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n", "... ... ... ... \n", "318964 2564021 44247_204_239_89278 2023-09-12 18:59:48.750953+02:00 \n", "318965 2564022 44247_204_299_89278 2023-09-12 18:59:48.751441+02:00 \n", "318966 2564023 44247_204_259_89278 2023-09-12 18:59:48.751924+02:00 \n", "318967 2564024 44247_204_279_89278 2023-09-12 18:59:48.752425+02:00 \n", "318968 2513156 4854_178_2847_89170 2023-09-12 18:52:20.331807+02:00 \n", "\n", " updated_at purchase_id product_id \\\n", "0 2023-09-12 17:42:45.396336+02:00 861764 209879 \n", "1 2023-09-12 17:42:45.409056+02:00 861763 209879 \n", "2 2023-09-12 17:42:45.409824+02:00 861769 209880 \n", "3 2023-09-12 17:42:45.410447+02:00 861767 209880 \n", "4 2023-09-12 17:42:45.411059+02:00 861768 209880 \n", "... ... ... ... \n", "318964 2023-09-12 18:59:48.750953+02:00 1244281 210158 \n", "318965 2023-09-12 18:59:48.751441+02:00 1244284 210158 \n", "318966 2023-09-12 18:59:48.751924+02:00 1244282 210158 \n", "318967 2023-09-12 18:59:48.752425+02:00 1244283 210158 \n", "318968 2023-09-12 18:59:48.752904+02:00 1244285 261922 \n", "\n", " is_from_subscription type_of supplier_id barcode \\\n", "0 False 1 1702 NaN \n", "1 False 1 1702 NaN \n", "2 False 1 1702 NaN \n", "3 False 1 1702 NaN \n", "4 False 1 1702 NaN \n", "... ... ... ... ... 
\n", "318964 False 1 1702 NaN \n", "318965 False 1 1702 NaN \n", "318966 False 1 1702 NaN \n", "318967 False 1 1702 NaN \n", "318968 False 3 1702 NaN \n", "\n", " identifier \n", "0 f694c255855ce5643c6fcc7fed5e9237 \n", "1 838d6101db2fc8bc80536d8b91b49859 \n", "2 8a8d938d66a4dc57bcb44c2773c6fdfa \n", "3 b7a3dd0794c0957c942d45b8913e5b96 \n", "4 d7ea7e443581ebe520dd13f6cad31af7 \n", "... ... \n", "318964 82c9af8b2167f7ac34a5e834242b0239 \n", "318965 235e8e608f066cb72949bbd397d0a76f \n", "318966 ec22fa828931f030f7e79a4cc5478c4b \n", "318967 31ec4deaf718e04caf193e1ff8d621ef \n", "318968 48aef9efab29bfb1537656908863bcc1 \n", "\n", "[318969 rows x 11 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tickets" ] }, { "cell_type": "code", "execution_count": 33, "id": "ad743347-33d1-41f0-852d-f9e6354f82ed", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 0])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tickets['type_of'].unique()" ] }, { "cell_type": "markdown", "id": "b88808fe-3b4e-49ed-9885-d52910b6f211", "metadata": {}, "source": [ "## Types d'évenement et client" ] }, { "cell_type": "code", "execution_count": 6, "id": "ecb03a47-1418-4fb1-8c78-cd222d38b7fd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'created_at', 'updated_at', 'season_id', 'facility_id', 'name',\n", " 'event_type_id', 'manual_added', 'is_display', 'event_type_key_id',\n", " 'facility_key_id', 'identifier'],\n", " dtype='object')\n", "(403, 12)\n", "\n", "RangeIndex: 403 entries, 0 to 402\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 403 non-null int64 \n", " 1 created_at 403 non-null object\n", " 2 updated_at 403 non-null object\n", " 3 season_id 403 non-null int64 \n", " 4 facility_id 403 non-null int64 \n", " 5 name 403 non-null object\n", " 6 
event_type_id 403 non-null int64 \n", " 7 manual_added 403 non-null bool \n", " 8 is_display 403 non-null bool \n", " 9 event_type_key_id 403 non-null int64 \n", " 10 facility_key_id 403 non-null int64 \n", " 11 identifier 403 non-null object\n", "dtypes: bool(2), int64(6), object(4)\n", "memory usage: 32.4+ KB\n" ] } ], "source": [ "# Evenement = events.csv\n", "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " events = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(events.columns)\n", "print(events.shape)\n", "events.info()" ] }, { "cell_type": "code", "execution_count": 7, "id": "19706610-9e90-4e6f-8bd0-da124b87cff7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcreated_atupdated_atseason_idfacility_idnameevent_type_idmanual_addedis_displayevent_type_key_idfacility_key_ididentifier
0203672023-09-13 03:42:45.214293+02:002023-09-13 03:54:30.086969+02:0018651054marelle1055FalseTrue1055105426d1e9a4acad18b9cf79244334c86c93
1203712023-09-13 03:42:45.218728+02:002023-09-13 03:54:30.103943+02:0018651054dialogues1055FalseTrue1055105460356fc5e8ed6c9c1be9c5ec67e77766
2205702023-10-05 04:48:29.374504+02:002023-10-05 04:48:36.562528+02:0018651054les grandes epopees1055FalseTrue10551054f8ab088e06252bf34e1b12ad2ce1a403
3207572023-11-01 03:55:20.846196+01:002023-11-01 03:55:28.412457+01:0018651054scolaire marelle1055FalseTrue10551054447fa80f9a793b7587bb85ebbda6442c
4203642023-09-13 03:42:45.196791+02:002023-09-13 03:54:30.075456+02:0018651054le couronnement de poppee1055FalseTrue105510543b37f5d2cd354cbc422868621ac7ebc2
.......................................
398156032023-09-12 17:42:25.327618+02:002023-09-12 19:00:00.893400+02:0017061054marelle1055FalseTrue10551054fde88b72fb82b1fe42fbbfbfc3d6b4d3
399156212023-09-12 17:42:25.335792+02:002023-09-12 19:00:00.899622+02:0017081054cartes d'adhesion1055FalseTrue10551054051b96aad2b720bad4450a59ed7dfbf6
400157402023-09-12 17:47:05.112101+02:002023-09-12 19:00:00.906123+02:0017111054repetition le medecin malgre lui1055FalseTrue10551054addd6885bea5ddf60ec3539dfc3e79e8
401155202023-09-12 17:42:25.290280+02:002023-09-12 19:00:00.835625+02:0017081054opera au village1055FalseTrue1055105494f250d10d4a56358ceab23b384439ff
402154392023-09-12 17:42:25.252747+02:002023-09-12 19:00:00.735990+02:0017081054florilege1055FalseTrue105510544f015946bcbd856aa573cadb7ac42b9f
\n", "

403 rows × 12 columns

\n", "
" ], "text/plain": [ " id created_at \\\n", "0 20367 2023-09-13 03:42:45.214293+02:00 \n", "1 20371 2023-09-13 03:42:45.218728+02:00 \n", "2 20570 2023-10-05 04:48:29.374504+02:00 \n", "3 20757 2023-11-01 03:55:20.846196+01:00 \n", "4 20364 2023-09-13 03:42:45.196791+02:00 \n", ".. ... ... \n", "398 15603 2023-09-12 17:42:25.327618+02:00 \n", "399 15621 2023-09-12 17:42:25.335792+02:00 \n", "400 15740 2023-09-12 17:47:05.112101+02:00 \n", "401 15520 2023-09-12 17:42:25.290280+02:00 \n", "402 15439 2023-09-12 17:42:25.252747+02:00 \n", "\n", " updated_at season_id facility_id \\\n", "0 2023-09-13 03:54:30.086969+02:00 1865 1054 \n", "1 2023-09-13 03:54:30.103943+02:00 1865 1054 \n", "2 2023-10-05 04:48:36.562528+02:00 1865 1054 \n", "3 2023-11-01 03:55:28.412457+01:00 1865 1054 \n", "4 2023-09-13 03:54:30.075456+02:00 1865 1054 \n", ".. ... ... ... \n", "398 2023-09-12 19:00:00.893400+02:00 1706 1054 \n", "399 2023-09-12 19:00:00.899622+02:00 1708 1054 \n", "400 2023-09-12 19:00:00.906123+02:00 1711 1054 \n", "401 2023-09-12 19:00:00.835625+02:00 1708 1054 \n", "402 2023-09-12 19:00:00.735990+02:00 1708 1054 \n", "\n", " name event_type_id manual_added \\\n", "0 marelle 1055 False \n", "1 dialogues 1055 False \n", "2 les grandes epopees 1055 False \n", "3 scolaire marelle 1055 False \n", "4 le couronnement de poppee 1055 False \n", ".. ... ... ... \n", "398 marelle 1055 False \n", "399 cartes d'adhesion 1055 False \n", "400 repetition le medecin malgre lui 1055 False \n", "401 opera au village 1055 False \n", "402 florilege 1055 False \n", "\n", " is_display event_type_key_id facility_key_id \\\n", "0 True 1055 1054 \n", "1 True 1055 1054 \n", "2 True 1055 1054 \n", "3 True 1055 1054 \n", "4 True 1055 1054 \n", ".. ... ... ... 
\n", "398 True 1055 1054 \n", "399 True 1055 1054 \n", "400 True 1055 1054 \n", "401 True 1055 1054 \n", "402 True 1055 1054 \n", "\n", " identifier \n", "0 26d1e9a4acad18b9cf79244334c86c93 \n", "1 60356fc5e8ed6c9c1be9c5ec67e77766 \n", "2 f8ab088e06252bf34e1b12ad2ce1a403 \n", "3 447fa80f9a793b7587bb85ebbda6442c \n", "4 3b37f5d2cd354cbc422868621ac7ebc2 \n", ".. ... \n", "398 fde88b72fb82b1fe42fbbfbfc3d6b4d3 \n", "399 051b96aad2b720bad4450a59ed7dfbf6 \n", "400 addd6885bea5ddf60ec3539dfc3e79e8 \n", "401 94f250d10d4a56358ceab23b384439ff \n", "402 4f015946bcbd856aa573cadb7ac42b9f \n", "\n", "[403 rows x 12 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "events" ] }, { "cell_type": "code", "execution_count": 15, "id": "6cb04679-26e7-4ed8-bfc1-42285da96374", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "357" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "events['name'].nunique()" ] }, { "cell_type": "code", "execution_count": 16, "id": "c10297e8-a8f9-45f9-8553-17e3fdb6f8c1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'serial', 'event_id', 'created_at', 'updated_at',\n", " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n", " 'is_display', 'representation_type_id', 'expected_filling',\n", " 'max_filling', 'extra_field', 'identifier'],\n", " dtype='object')\n", "(996, 16)\n", "\n", "RangeIndex: 996 entries, 0 to 995\n", "Data columns (total 16 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 996 non-null int64 \n", " 1 serial 0 non-null float64\n", " 2 event_id 996 non-null int64 \n", " 3 created_at 996 non-null object \n", " 4 updated_at 996 non-null object \n", " 5 start_date_time 996 non-null object \n", " 6 open 996 non-null bool \n", " 7 satisfaction 0 non-null float64\n", " 8 end_date_time 996 non-null object \n", " 9 name 0 non-null 
float64\n", " 10 is_display 996 non-null bool \n", " 11 representation_type_id 0 non-null float64\n", " 12 expected_filling 24 non-null float64\n", " 13 max_filling 24 non-null float64\n", " 14 extra_field 0 non-null float64\n", " 15 identifier 996 non-null object \n", "dtypes: bool(2), float64(7), int64(2), object(5)\n", "memory usage: 111.0+ KB\n" ] } ], "source": [ "# Représentation des évenements = representations.csv\n", "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " representations = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(representations.columns)\n", "print(representations.shape)\n", "representations.info()" ] }, { "cell_type": "code", "execution_count": 17, "id": "41ef6a1b-e99e-4c73-a2ae-ba7d438d90c2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idserialevent_idcreated_atupdated_atstart_date_timeopensatisfactionend_date_timenameis_displayrepresentation_type_idexpected_fillingmax_fillingextra_fieldidentifier
044351NaN203712023-09-13 03:42:45.245879+02:002023-09-13 03:42:45.245879+02:002023-12-21 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaN550.0550.0NaN33520762e8cc28982e3841cbc2be8ce2
145497NaN207572023-11-01 03:55:20.875712+01:002023-11-01 03:55:20.875712+01:002023-11-28 10:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN5c34b84e3d11276e0995d984c94cd28d
244383NaN203832023-09-13 10:41:08.964302+02:002023-09-13 10:41:08.964302+02:002023-06-04 17:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNbf3c65a1dfefbd747dcc2360e6887eac
344384NaN203832023-09-13 10:41:08.972401+02:002023-09-13 10:41:08.972401+02:002023-06-03 17:30:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNb0e69ae8b78ebab3066aac83de22d239
444385NaN203842023-09-13 10:41:08.973290+02:002023-09-13 10:41:08.973290+02:002023-06-03 16:15:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN9fb91c8b1cf9e444111c511e212ac5c1
...................................................
99133894NaN156472023-09-12 17:42:25.564297+02:002023-09-12 17:42:25.564297+02:002022-11-08 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN44bbcecfd007ceaad05805391beccabb
99233873NaN156402023-09-12 17:42:25.554863+02:002023-09-12 17:42:25.554863+02:002022-11-14 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN151edbec8e0a3cd80071038e857f3493
99333610NaN155202023-09-12 17:42:25.442979+02:002023-09-12 17:42:25.442979+02:002023-06-19 18:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN9e9e38d527427e1b6f67e0c3f12b82fc
99433953NaN155202023-09-12 17:42:25.590746+02:002023-09-12 17:42:25.590746+02:002023-06-19 20:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN7bf0978aabb6cac1bb4cd2784afb2b6b
99533639NaN155332023-09-12 17:42:25.455708+02:002023-09-12 17:42:25.455708+02:002023-04-15 17:30:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNfae68f1e09710ec8747957af6e22f61d
\n", "

996 rows × 16 columns

\n", "
" ], "text/plain": [ " id serial event_id created_at \\\n", "0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n", "1 45497 NaN 20757 2023-11-01 03:55:20.875712+01:00 \n", "2 44383 NaN 20383 2023-09-13 10:41:08.964302+02:00 \n", "3 44384 NaN 20383 2023-09-13 10:41:08.972401+02:00 \n", "4 44385 NaN 20384 2023-09-13 10:41:08.973290+02:00 \n", ".. ... ... ... ... \n", "991 33894 NaN 15647 2023-09-12 17:42:25.564297+02:00 \n", "992 33873 NaN 15640 2023-09-12 17:42:25.554863+02:00 \n", "993 33610 NaN 15520 2023-09-12 17:42:25.442979+02:00 \n", "994 33953 NaN 15520 2023-09-12 17:42:25.590746+02:00 \n", "995 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n", "\n", " updated_at start_date_time open \\\n", "0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n", "1 2023-11-01 03:55:20.875712+01:00 2023-11-28 10:00:00+01:00 True \n", "2 2023-09-13 10:41:08.964302+02:00 2023-06-04 17:00:00+02:00 True \n", "3 2023-09-13 10:41:08.972401+02:00 2023-06-03 17:30:00+02:00 True \n", "4 2023-09-13 10:41:08.973290+02:00 2023-06-03 16:15:00+02:00 True \n", ".. ... ... ... \n", "991 2023-09-12 17:42:25.564297+02:00 2022-11-08 20:00:00+01:00 True \n", "992 2023-09-12 17:42:25.554863+02:00 2022-11-14 20:00:00+01:00 True \n", "993 2023-09-12 17:42:25.442979+02:00 2023-06-19 18:00:00+02:00 True \n", "994 2023-09-12 17:42:25.590746+02:00 2023-06-19 20:00:00+02:00 True \n", "995 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n", "\n", " satisfaction end_date_time name is_display \\\n", "0 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "1 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "2 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "3 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "4 NaN 1901-01-01 00:09:21+00:09 NaN True \n", ".. ... ... ... ... 
\n", "991 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "992 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "993 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "994 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "995 NaN 1901-01-01 00:09:21+00:09 NaN True \n", "\n", " representation_type_id expected_filling max_filling extra_field \\\n", "0 NaN 550.0 550.0 NaN \n", "1 NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN \n", ".. ... ... ... ... \n", "991 NaN NaN NaN NaN \n", "992 NaN NaN NaN NaN \n", "993 NaN NaN NaN NaN \n", "994 NaN NaN NaN NaN \n", "995 NaN NaN NaN NaN \n", "\n", " identifier \n", "0 33520762e8cc28982e3841cbc2be8ce2 \n", "1 5c34b84e3d11276e0995d984c94cd28d \n", "2 bf3c65a1dfefbd747dcc2360e6887eac \n", "3 b0e69ae8b78ebab3066aac83de22d239 \n", "4 9fb91c8b1cf9e444111c511e212ac5c1 \n", ".. ... \n", "991 44bbcecfd007ceaad05805391beccabb \n", "992 151edbec8e0a3cd80071038e857f3493 \n", "993 9e9e38d527427e1b6f67e0c3f12b82fc \n", "994 7bf0978aabb6cac1bb4cd2784afb2b6b \n", "995 fae68f1e09710ec8747957af6e22f61d \n", "\n", "[996 rows x 16 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "representations" ] }, { "cell_type": "code", "execution_count": 18, "id": "ae6cdad3-2184-4ae7-928c-2f8bd7769a5b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'amount', 'is_full_price', 'representation_id',\n", " 'pricing_formula_id', 'created_at', 'updated_at', 'category_id',\n", " 'apply_price', 'products_group_id', 'product_pack_id', 'extra_field',\n", " 'amount_consumption', 'identifier'],\n", " dtype='object')\n", "(14648, 14)\n", "\n", "RangeIndex: 14648 entries, 0 to 14647\n", "Data columns (total 14 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 14648 non-null int64 \n", " 1 amount 14648 non-null float64\n", " 2 is_full_price 14648 non-null bool \n", " 3 representation_id 14648 non-null 
int64 \n", " 4 pricing_formula_id 14648 non-null int64 \n", " 5 created_at 14648 non-null object \n", " 6 updated_at 14648 non-null object \n", " 7 category_id 14648 non-null int64 \n", " 8 apply_price 14648 non-null float64\n", " 9 products_group_id 14648 non-null int64 \n", " 10 product_pack_id 14648 non-null int64 \n", " 11 extra_field 0 non-null float64\n", " 12 amount_consumption 0 non-null float64\n", " 13 identifier 14648 non-null object \n", "dtypes: bool(1), float64(4), int64(6), object(3)\n", "memory usage: 1.5+ MB\n" ] } ], "source": [ "# Produits vendues = products.csv\n", "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " products = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(products.columns)\n", "print(products.shape)\n", "products.info()" ] }, { "cell_type": "code", "execution_count": 19, "id": "34f1825d-148a-4a6e-88d6-61449fee3ee4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idamountis_full_pricerepresentation_idpricing_formula_idcreated_atupdated_atcategory_idapply_priceproducts_group_idproduct_pack_idextra_fieldamount_consumptionidentifier
026832518.0False44332204772023-09-13 03:42:45.415594+02:002023-09-13 03:42:45.415594+02:0049720.02681081NaNNaNb823bbea3ba837da2ef8efaf1287272d
127411836.8False44340205022023-10-25 03:26:57.430694+02:002023-10-25 03:26:57.430694+02:0049690.02739011NaNNaN81e8b7991f6948e3ef7cfe5011d13532
226833839.1False44340204972023-09-13 03:42:45.430942+02:002023-09-13 03:42:45.430942+02:0049690.02681211NaNNaNbe8bc0399db4d04aefa9f44afd4d5efa
32098830.0False33443204752023-09-12 17:42:27.595998+02:002023-09-12 17:42:27.595998+02:0049700.02097061NaNNaN01a9eea5f8ad53491faa864bfac44183
426832663.0False44333204772023-09-13 03:42:45.417283+02:002023-09-13 03:42:45.417283+02:0049690.02681091NaNNaN781a917ecfdabb14169701d7b143bbe4
.............................................
1464321787833.6False33919204892023-09-12 17:51:11.572882+02:002023-09-12 17:51:11.572882+02:0049710.02176951NaNNaN82bba69321466069411b3023343b44a4
1464426831510.0False33919205042023-09-12 18:59:29.995176+02:002023-09-12 18:59:29.995176+02:0049690.02680981NaNNaNeae56a8eb0a4315c5713b2053103d595
146452101485.0False33531204732023-09-12 17:42:27.733260+02:002023-09-12 17:42:27.733260+02:0049750.02099711NaNNaN449f86c1ef2b478d3389f7d0e27d0e6b
1464621205430.0False33810204732023-09-12 17:42:28.724681+02:002023-09-12 17:42:28.724681+02:0049720.02118761NaNNaN2090203e2c0b58ea8f505089faee6d62
1464726192221.0False33766204882023-09-12 18:52:00.519838+02:002023-09-12 18:52:00.519838+02:0049720.02617091NaNNaN9139ee36a92bed766ae95372cca77336
\n", "

14648 rows × 14 columns

\n", "
" ], "text/plain": [ " id amount is_full_price representation_id pricing_formula_id \\\n", "0 268325 18.0 False 44332 20477 \n", "1 274118 36.8 False 44340 20502 \n", "2 268338 39.1 False 44340 20497 \n", "3 209883 0.0 False 33443 20475 \n", "4 268326 63.0 False 44333 20477 \n", "... ... ... ... ... ... \n", "14643 217878 33.6 False 33919 20489 \n", "14644 268315 10.0 False 33919 20504 \n", "14645 210148 5.0 False 33531 20473 \n", "14646 212054 30.0 False 33810 20473 \n", "14647 261922 21.0 False 33766 20488 \n", "\n", " created_at updated_at \\\n", "0 2023-09-13 03:42:45.415594+02:00 2023-09-13 03:42:45.415594+02:00 \n", "1 2023-10-25 03:26:57.430694+02:00 2023-10-25 03:26:57.430694+02:00 \n", "2 2023-09-13 03:42:45.430942+02:00 2023-09-13 03:42:45.430942+02:00 \n", "3 2023-09-12 17:42:27.595998+02:00 2023-09-12 17:42:27.595998+02:00 \n", "4 2023-09-13 03:42:45.417283+02:00 2023-09-13 03:42:45.417283+02:00 \n", "... ... ... \n", "14643 2023-09-12 17:51:11.572882+02:00 2023-09-12 17:51:11.572882+02:00 \n", "14644 2023-09-12 18:59:29.995176+02:00 2023-09-12 18:59:29.995176+02:00 \n", "14645 2023-09-12 17:42:27.733260+02:00 2023-09-12 17:42:27.733260+02:00 \n", "14646 2023-09-12 17:42:28.724681+02:00 2023-09-12 17:42:28.724681+02:00 \n", "14647 2023-09-12 18:52:00.519838+02:00 2023-09-12 18:52:00.519838+02:00 \n", "\n", " category_id apply_price products_group_id product_pack_id \\\n", "0 4972 0.0 268108 1 \n", "1 4969 0.0 273901 1 \n", "2 4969 0.0 268121 1 \n", "3 4970 0.0 209706 1 \n", "4 4969 0.0 268109 1 \n", "... ... ... ... ... 
\n", "14643 4971 0.0 217695 1 \n", "14644 4969 0.0 268098 1 \n", "14645 4975 0.0 209971 1 \n", "14646 4972 0.0 211876 1 \n", "14647 4972 0.0 261709 1 \n", "\n", " extra_field amount_consumption identifier \n", "0 NaN NaN b823bbea3ba837da2ef8efaf1287272d \n", "1 NaN NaN 81e8b7991f6948e3ef7cfe5011d13532 \n", "2 NaN NaN be8bc0399db4d04aefa9f44afd4d5efa \n", "3 NaN NaN 01a9eea5f8ad53491faa864bfac44183 \n", "4 NaN NaN 781a917ecfdabb14169701d7b143bbe4 \n", "... ... ... ... \n", "14643 NaN NaN 82bba69321466069411b3023343b44a4 \n", "14644 NaN NaN eae56a8eb0a4315c5713b2053103d595 \n", "14645 NaN NaN 449f86c1ef2b478d3389f7d0e27d0e6b \n", "14646 NaN NaN 2090203e2c0b58ea8f505089faee6d62 \n", "14647 NaN NaN 9139ee36a92bed766ae95372cca77336 \n", "\n", "[14648 rows x 14 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "products" ] }, { "cell_type": "code", "execution_count": 20, "id": "6735b338-26b5-479d-825d-677ea533dad5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'name', 'created_at', 'updated_at', 'street_id', 'fixed_capacity',\n", " 'identifier'],\n", " dtype='object')\n", "(1, 7)\n", "\n", "RangeIndex: 1 entries, 0 to 0\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 1 non-null int64 \n", " 1 name 0 non-null float64\n", " 2 created_at 1 non-null object \n", " 3 updated_at 1 non-null object \n", " 4 street_id 1 non-null int64 \n", " 5 fixed_capacity 0 non-null float64\n", " 6 identifier 1 non-null object \n", "dtypes: float64(2), int64(2), object(3)\n", "memory usage: 184.0+ bytes\n" ] } ], "source": [ "# Lieu = facilities.csv\n", "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " facilities = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(facilities.columns)\n", "print(facilities.shape)\n", "facilities.info()" ] }, { 
"cell_type": "code", "execution_count": 21, "id": "428b86c2-50f4-42a5-9bbb-a17ffe820bf9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnamecreated_atupdated_atstreet_idfixed_capacityidentifier
01054NaN2023-09-12 17:42:25.223064+02:002023-09-12 17:42:25.223064+02:001NaNd41d8cd98f00b204e9800998ecf8427e
\n", "
" ], "text/plain": [ " id name created_at \\\n", "0 1054 NaN 2023-09-12 17:42:25.223064+02:00 \n", "\n", " updated_at street_id fixed_capacity \\\n", "0 2023-09-12 17:42:25.223064+02:00 1 NaN \n", "\n", " identifier \n", "0 d41d8cd98f00b204e9800998ecf8427e " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "facilities" ] }, { "cell_type": "code", "execution_count": 22, "id": "f6b26ad5-a4cc-4219-a0b0-406d9b025458", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'name', 'created_at', 'updated_at', 'start_date_time',\n", " 'identifier'],\n", " dtype='object')\n", "(9, 6)\n", "\n", "RangeIndex: 9 entries, 0 to 8\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 9 non-null int64 \n", " 1 name 9 non-null object \n", " 2 created_at 9 non-null object \n", " 3 updated_at 9 non-null object \n", " 4 start_date_time 0 non-null float64\n", " 5 identifier 9 non-null object \n", "dtypes: float64(1), int64(1), object(4)\n", "memory usage: 560.0+ bytes\n" ] } ], "source": [ "# Saisons = seasons.csv période sur deux années consécutives\n", "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " seasons = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(seasons.columns)\n", "print(seasons.shape)\n", "seasons.info()" ] }, { "cell_type": "code", "execution_count": 24, "id": "75c8c0ef-4ff5-45b1-a791-8ba2e9a4437e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['saison 2023-2024', 'saison 2021-2022', 'saison 2015-2016',\n", " 'saison 2016-2017', 'saison 2017-2018', 'saison 2018-2019',\n", " 'saison 2020-2021', 'saison 2019-2020', 'saison 2022-2023'],\n", " dtype=object)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seasons['name'].unique()" ] }, { "cell_type": "code", "execution_count": 25, "id": 
"cd0d10df-10cc-4f75-8b88-35f676c91f5b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['id', 'purchase_date', 'customer_id', 'created_at', 'updated_at',\n", " 'number', 'identifier'],\n", " dtype='object')\n", "(410695, 7)\n", "\n", "RangeIndex: 410695 entries, 0 to 410694\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 410695 non-null int64 \n", " 1 purchase_date 410695 non-null object \n", " 2 customer_id 410695 non-null int64 \n", " 3 created_at 410695 non-null object \n", " 4 updated_at 410695 non-null object \n", " 5 number 0 non-null float64\n", " 6 identifier 410695 non-null object \n", "dtypes: float64(1), int64(2), object(4)\n", "memory usage: 21.9+ MB\n" ] } ], "source": [ "# Achats = purchases.csv \n", "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", " purchases = pd.read_csv(file_in, sep=\",\")\n", "\n", "print(purchases.columns)\n", "print(purchases.shape)\n", "purchases.info()" ] }, { "cell_type": "code", "execution_count": 28, "id": "8f986fdb-ca37-4cbb-b526-2a6d0ce7ca2c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idpurchase_datecustomer_idcreated_atupdated_atnumberidentifier
08617612019-03-01 16:28:49+01:0049662023-09-12 17:42:37.564150+02:002023-09-12 17:42:37.564150+02:00NaNd20eb0c3a7efec0bbe338dee40dc3378
18617622019-03-01 16:29:11+01:0049662023-09-12 17:42:37.571159+02:002023-09-12 17:42:37.571159+02:00NaNcff3abfc018517bce5ccfc58f5cacf40
28617632019-03-01 16:29:17+01:0049662023-09-12 17:42:37.571646+02:002023-09-12 17:42:37.571646+02:00NaNe1155cf26b34f792bdb23e49244d7264
38617642019-03-01 16:29:19+01:0049662023-09-12 17:42:37.572063+02:002023-09-12 17:42:37.572063+02:00NaNe8b95cc6a1a8b103ffa39755ce3bfc4d
48617652019-03-01 16:32:08+01:004059942023-09-12 17:42:37.572470+02:002023-09-12 17:42:37.572470+02:00NaN1b763278914f1309e357abe5033a3f0f
........................
41069012859642023-10-21 21:46:41+02:005173092023-10-23 03:43:16.457501+02:002023-10-23 03:43:16.457501+02:00NaN72c4e90c2b151dcffc87b19ea8a0c4f1
41069112859652023-10-21 21:47:07+02:005173092023-10-23 03:43:16.458458+02:002023-10-23 03:43:16.458458+02:00NaNee65532087132145daa6154fbae050ea
41069212859662023-10-21 21:47:20+02:005173092023-10-23 03:43:16.458811+02:002023-10-23 03:43:16.458811+02:00NaN7e825dd352bc6a11ab81cb8068e325e6
41069312859672023-10-21 23:07:06+02:003999692023-10-23 03:43:16.459738+02:002023-10-23 03:43:16.459738+02:00NaNfdb92627a48d6ba8fa817d60a83dbea8
41069412859682023-10-21 23:07:39+02:003999692023-10-23 03:43:16.462409+02:002023-10-23 03:43:16.462409+02:00NaNe9dbaff4f7037a5b0efa11263584dfad
\n", "

410695 rows × 7 columns

\n", "
" ], "text/plain": [ " id purchase_date customer_id \\\n", "0 861761 2019-03-01 16:28:49+01:00 4966 \n", "1 861762 2019-03-01 16:29:11+01:00 4966 \n", "2 861763 2019-03-01 16:29:17+01:00 4966 \n", "3 861764 2019-03-01 16:29:19+01:00 4966 \n", "4 861765 2019-03-01 16:32:08+01:00 405994 \n", "... ... ... ... \n", "410690 1285964 2023-10-21 21:46:41+02:00 517309 \n", "410691 1285965 2023-10-21 21:47:07+02:00 517309 \n", "410692 1285966 2023-10-21 21:47:20+02:00 517309 \n", "410693 1285967 2023-10-21 23:07:06+02:00 399969 \n", "410694 1285968 2023-10-21 23:07:39+02:00 399969 \n", "\n", " created_at updated_at \\\n", "0 2023-09-12 17:42:37.564150+02:00 2023-09-12 17:42:37.564150+02:00 \n", "1 2023-09-12 17:42:37.571159+02:00 2023-09-12 17:42:37.571159+02:00 \n", "2 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n", "3 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n", "4 2023-09-12 17:42:37.572470+02:00 2023-09-12 17:42:37.572470+02:00 \n", "... ... ... \n", "410690 2023-10-23 03:43:16.457501+02:00 2023-10-23 03:43:16.457501+02:00 \n", "410691 2023-10-23 03:43:16.458458+02:00 2023-10-23 03:43:16.458458+02:00 \n", "410692 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n", "410693 2023-10-23 03:43:16.459738+02:00 2023-10-23 03:43:16.459738+02:00 \n", "410694 2023-10-23 03:43:16.462409+02:00 2023-10-23 03:43:16.462409+02:00 \n", "\n", " number identifier \n", "0 NaN d20eb0c3a7efec0bbe338dee40dc3378 \n", "1 NaN cff3abfc018517bce5ccfc58f5cacf40 \n", "2 NaN e1155cf26b34f792bdb23e49244d7264 \n", "3 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d \n", "4 NaN 1b763278914f1309e357abe5033a3f0f \n", "... ... ... 
\n", "410690 NaN 72c4e90c2b151dcffc87b19ea8a0c4f1 \n", "410691 NaN ee65532087132145daa6154fbae050ea \n", "410692 NaN 7e825dd352bc6a11ab81cb8068e325e6 \n", "410693 NaN fdb92627a48d6ba8fa817d60a83dbea8 \n", "410694 NaN e9dbaff4f7037a5b0efa11263584dfad \n", "\n", "[410695 rows x 7 columns]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "purchases" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }