diff --git a/TP_merge_tables_clean.ipynb b/TP_merge_tables_clean.ipynb new file mode 100644 index 0000000..66b5228 --- /dev/null +++ b/TP_merge_tables_clean.ipynb @@ -0,0 +1,1760 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8c56d518-3634-4492-b249-0d8ef33dd527", + "metadata": {}, + "source": [ + "## First steps : package importations, set up working environment and import data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dede42d9-1262-45f7-bd7a-586ae800092a", + "metadata": {}, + "outputs": [], + "source": [ + "# importations\n", + "\n", + "import os \n", + "import s3fs\n", + "import pandas as pd\n", + "import re\n", + "from datetime import datetime, timezone, timedelta\n", + "import math\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6ce34b58-b5ba-4b54-ba4d-fc82ef01b09c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1',\n", + " 'bdc2324-data/10',\n", + " 'bdc2324-data/101',\n", + " 'bdc2324-data/11',\n", + " 'bdc2324-data/12',\n", + " 'bdc2324-data/13',\n", + " 'bdc2324-data/14',\n", + " 'bdc2324-data/2',\n", + " 'bdc2324-data/3',\n", + " 'bdc2324-data/4',\n", + " 'bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bucket for accessing the data\n", + "\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "\n", + "fs = s3fs.S3FileSystem(client_kwargs = {\"endpoint_url\" : S3_ENDPOINT_URL})\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8eb13dd3-53c7-4a70-94a4-846168473aa1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1/1campaign_stats.csv',\n", + " 'bdc2324-data/1/1campaigns.csv',\n", + " 
'bdc2324-data/1/1categories.csv',\n", + " 'bdc2324-data/1/1countries.csv',\n", + " 'bdc2324-data/1/1currencies.csv',\n", + " 'bdc2324-data/1/1customer_target_mappings.csv',\n", + " 'bdc2324-data/1/1customersplus.csv',\n", + " 'bdc2324-data/1/1event_types.csv',\n", + " 'bdc2324-data/1/1events.csv',\n", + " 'bdc2324-data/1/1facilities.csv',\n", + " 'bdc2324-data/1/1link_stats.csv',\n", + " 'bdc2324-data/1/1pricing_formulas.csv',\n", + " 'bdc2324-data/1/1product_packs.csv',\n", + " 'bdc2324-data/1/1products.csv',\n", + " 'bdc2324-data/1/1products_groups.csv',\n", + " 'bdc2324-data/1/1purchases.csv',\n", + " 'bdc2324-data/1/1representation_category_capacities.csv',\n", + " 'bdc2324-data/1/1representations.csv',\n", + " 'bdc2324-data/1/1seasons.csv',\n", + " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", + " 'bdc2324-data/1/1suppliers.csv',\n", + " 'bdc2324-data/1/1tags.csv',\n", + " 'bdc2324-data/1/1target_types.csv',\n", + " 'bdc2324-data/1/1targets.csv',\n", + " 'bdc2324-data/1/1tickets.csv',\n", + " 'bdc2324-data/1/1type_of_categories.csv',\n", + " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", + " 'bdc2324-data/1/1type_ofs.csv']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FILE_PATH_S3 = fs.ls(BUCKET)[0] # focus on the company number 1\n", + "files_path = fs.ls(FILE_PATH_S3)\n", + "files_path" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1ea66c4e-1307-4f19-836e-3104fba2ff41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_487/2894332003.py:10: DtypeWarning: Columns (1) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(file_in)\n" + ] + } + ], + "source": [ + "# loop to create dataframes related to company 1\n", + "\n", + "client_number = files_path[0].split(\"/\")[1]\n", + "print(client_number)\n", + "df_prefix = \"df\" + str(client_number) + \"_\"\n", + "\n", + "for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df" + ] + }, + { + "cell_type": "markdown", + "id": "13d70b2c-6580-4caf-b839-10f72b2e0b39", + "metadata": {}, + "source": [ + "## Target, target types and customer target mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4dbc7fea-ac3b-4348-83fb-dfb1a460f936", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idis_importnamecreated_atupdated_atidentifier
069Falsemanual_dynamic_filter2020-11-30 09:46:18.881030+01:002020-11-30 09:46:18.881030+01:00e0f4b8693184850fefd6d2a38f10584e
148Truemanual_structure2020-11-04 17:16:19.548275+01:002020-11-04 17:16:19.548275+01:00382bca214204a2d3462f5ec2728d5d1e
21Truemanual_import2020-10-14 18:37:40.521623+02:002020-10-14 18:37:40.521623+02:0012213df2ce68a624e4c0070521437bac
356Falsemanual_static_filter2020-11-04 18:08:37.233486+01:002020-11-04 18:08:37.233486+01:00fb27e81baa4debc6a4e1a8639c20e808
\n", + "
" + ], + "text/plain": [ + " id is_import name created_at \\\n", + "0 69 False manual_dynamic_filter 2020-11-30 09:46:18.881030+01:00 \n", + "1 48 True manual_structure 2020-11-04 17:16:19.548275+01:00 \n", + "2 1 True manual_import 2020-10-14 18:37:40.521623+02:00 \n", + "3 56 False manual_static_filter 2020-11-04 18:08:37.233486+01:00 \n", + "\n", + " updated_at identifier \n", + "0 2020-11-30 09:46:18.881030+01:00 e0f4b8693184850fefd6d2a38f10584e \n", + "1 2020-11-04 17:16:19.548275+01:00 382bca214204a2d3462f5ec2728d5d1e \n", + "2 2020-10-14 18:37:40.521623+02:00 12213df2ce68a624e4c0070521437bac \n", + "3 2020-11-04 18:08:37.233486+01:00 fb27e81baa4debc6a4e1a8639c20e808 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. target types\n", + "df1_target_types.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0e9f5dcb-0dc3-4052-b866-e5c4cb954a1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtarget_type_idnamecreated_atupdated_at
021756DDCP PROMO Art contemporain - salle de chauffe...2021-01-04 15:00:05.401899+01:002021-03-02 18:38:19.025969+01:00
170156consentement optin scolaires2021-12-21 16:03:59.840785+01:002022-02-18 17:23:44.761388+01:00
213456DDCP Newsletter jeune public2020-11-10 09:43:19.667471+01:002021-03-02 18:38:19.052304+01:00
370056consentement optout scolaires2021-12-21 16:01:57.524946+01:002022-02-18 17:23:44.807776+01:00
496456DDCP achat billet nbr dep 190520212022-04-14 10:58:17.142834+02:002022-04-14 10:58:23.677264+02:00
\n", + "
" + ], + "text/plain": [ + " id target_type_id name \\\n", + "0 217 56 DDCP PROMO Art contemporain - salle de chauffe... \n", + "1 701 56 consentement optin scolaires \n", + "2 134 56 DDCP Newsletter jeune public \n", + "3 700 56 consentement optout scolaires \n", + "4 964 56 DDCP achat billet nbr dep 19052021 \n", + "\n", + " created_at updated_at \n", + "0 2021-01-04 15:00:05.401899+01:00 2021-03-02 18:38:19.025969+01:00 \n", + "1 2021-12-21 16:03:59.840785+01:00 2022-02-18 17:23:44.761388+01:00 \n", + "2 2020-11-10 09:43:19.667471+01:00 2021-03-02 18:38:19.052304+01:00 \n", + "3 2021-12-21 16:01:57.524946+01:00 2022-02-18 17:23:44.807776+01:00 \n", + "4 2022-04-14 10:58:17.142834+02:00 2022-04-14 10:58:23.677264+02:00 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 2. targets\n", + "df1_targets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c5c62302-370a-462f-bd79-eac31593f65c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_idcreated_atupdated_atnameextra_field
011848246454001302021-09-23 09:35:47.617275+02:002021-09-23 09:35:47.617275+02:00NaNNaN
111848256454003452021-09-23 09:35:47.668846+02:002021-09-23 09:35:47.668846+02:00NaNNaN
211848286454021262021-09-23 12:02:51.253269+02:002021-09-23 12:02:51.253269+02:00NaNNaN
311848296454031262021-09-23 12:20:47.394480+02:002021-09-23 12:20:47.394480+02:00NaNNaN
412957706473013462021-09-28 16:02:29.372608+02:002021-09-28 16:02:29.372608+02:00NaNNaN
\n", + "
" + ], + "text/plain": [ + " id customer_id target_id created_at \\\n", + "0 1184824 645400 130 2021-09-23 09:35:47.617275+02:00 \n", + "1 1184825 645400 345 2021-09-23 09:35:47.668846+02:00 \n", + "2 1184828 645402 126 2021-09-23 12:02:51.253269+02:00 \n", + "3 1184829 645403 126 2021-09-23 12:20:47.394480+02:00 \n", + "4 1295770 647301 346 2021-09-28 16:02:29.372608+02:00 \n", + "\n", + " updated_at name extra_field \n", + "0 2021-09-23 09:35:47.617275+02:00 NaN NaN \n", + "1 2021-09-23 09:35:47.668846+02:00 NaN NaN \n", + "2 2021-09-23 12:02:51.253269+02:00 NaN NaN \n", + "3 2021-09-23 12:20:47.394480+02:00 NaN NaN \n", + "4 2021-09-28 16:02:29.372608+02:00 NaN NaN " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3. customer target mapping\n", + "\n", + "df1_customer_target_mappings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1a87cebf-c1dd-408d-a523-26633419da1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtarget_type_idnametarget_type_is_importtarget_type_name
021756DDCP PROMO Art contemporain - salle de chauffe...Falsemanual_static_filter
170156consentement optin scolairesFalsemanual_static_filter
213456DDCP Newsletter jeune publicFalsemanual_static_filter
370056consentement optout scolairesFalsemanual_static_filter
496456DDCP achat billet nbr dep 19052021Falsemanual_static_filter
\n", + "
" + ], + "text/plain": [ + " id target_type_id name \\\n", + "0 217 56 DDCP PROMO Art contemporain - salle de chauffe... \n", + "1 701 56 consentement optin scolaires \n", + "2 134 56 DDCP Newsletter jeune public \n", + "3 700 56 consentement optout scolaires \n", + "4 964 56 DDCP achat billet nbr dep 19052021 \n", + "\n", + " target_type_is_import target_type_name \n", + "0 False manual_static_filter \n", + "1 False manual_static_filter \n", + "2 False manual_static_filter \n", + "3 False manual_static_filter \n", + "4 False manual_static_filter " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 4.1. merge target with target type\n", + "\n", + "df1_targets_full = pd.merge(df1_targets[[\"id\", \"target_type_id\", \"name\"]], df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\"), left_on='target_type_id', right_on='target_type_id', how='left')\n", + "df1_targets_full.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d48c1fff-73c2-4e75-8799-da2b80694be7", + "metadata": {}, + "outputs": [], + "source": [ + "# 4.2. 
merge df1_customer_target_mappings with df1_targets_full\n", + "\n", + "# change the position of the column target type id\n", + "\n", + "# Spécifiez le nom de la colonne à déplacer et la colonne après laquelle vous souhaitez la placer\n", + "column_to_move = 'target_type_id'\n", + "\n", + "# Récupérez l'index de la colonne de référence\n", + "reference_index = df1_targets_full.columns.get_loc(\"target_type_name\")\n", + "\n", + "# Créez une copie de la colonne que vous voulez déplacer\n", + "column_copy = df1_targets_full[column_to_move].copy()\n", + "\n", + "# Supprimez la colonne d'origine\n", + "df1_targets_full = df1_targets_full.drop(column_to_move, axis=1)\n", + "\n", + "# Utilisez la méthode insert pour déplacer la colonne à la nouvelle position\n", + "df1_targets_full.insert(reference_index - 1, column_to_move, column_copy)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a874514a-c7dc-42d4-a440-dedd3a270e24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_idtarget_nametarget_type_is_importtarget_type_idtarget_type_name
0217DDCP PROMO Art contemporain - salle de chauffe...False56manual_static_filter
1701consentement optin scolairesFalse56manual_static_filter
2134DDCP Newsletter jeune publicFalse56manual_static_filter
3700consentement optout scolairesFalse56manual_static_filter
4964DDCP achat billet nbr dep 19052021False56manual_static_filter
\n", + "
" + ], + "text/plain": [ + " target_id target_name \\\n", + "0 217 DDCP PROMO Art contemporain - salle de chauffe... \n", + "1 701 consentement optin scolaires \n", + "2 134 DDCP Newsletter jeune public \n", + "3 700 consentement optout scolaires \n", + "4 964 DDCP achat billet nbr dep 19052021 \n", + "\n", + " target_type_is_import target_type_id target_type_name \n", + "0 False 56 manual_static_filter \n", + "1 False 56 manual_static_filter \n", + "2 False 56 manual_static_filter \n", + "3 False 56 manual_static_filter \n", + "4 False 56 manual_static_filter " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_targets_full = df1_targets_full.rename(columns=lambda x: 'target_' + x if not x.startswith('target_') else x)\n", + "df1_targets_full.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "0db0172a-5119-4b7f-97f8-36fc5c985205", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_idtarget_nametarget_type_is_importtarget_type_idtarget_type_name
01184824645400130DDCP PROMO Réseau livresFalse56manual_static_filter
11184825645400345Inscrits NL générale site webFalse56manual_static_filter
21184828645402126DDCP PROMO Art contemporainFalse56manual_static_filter
31184829645403126DDCP PROMO Art contemporainFalse56manual_static_filter
41295770647301346Votre première listeFalse56manual_static_filter
........................
7680192737545666983345Inscrits NL générale site webFalse56manual_static_filter
7680202737546666983346Votre première listeFalse56manual_static_filter
7680212737575666986346Votre première listeFalse56manual_static_filter
7680222737576666987345Inscrits NL générale site webFalse56manual_static_filter
7680232737577666987346Votre première listeFalse56manual_static_filter
\n", + "

768024 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id target_id target_name \\\n", + "0 1184824 645400 130 DDCP PROMO Réseau livres \n", + "1 1184825 645400 345 Inscrits NL générale site web \n", + "2 1184828 645402 126 DDCP PROMO Art contemporain \n", + "3 1184829 645403 126 DDCP PROMO Art contemporain \n", + "4 1295770 647301 346 Votre première liste \n", + "... ... ... ... ... \n", + "768019 2737545 666983 345 Inscrits NL générale site web \n", + "768020 2737546 666983 346 Votre première liste \n", + "768021 2737575 666986 346 Votre première liste \n", + "768022 2737576 666987 345 Inscrits NL générale site web \n", + "768023 2737577 666987 346 Votre première liste \n", + "\n", + " target_type_is_import target_type_id target_type_name \n", + "0 False 56 manual_static_filter \n", + "1 False 56 manual_static_filter \n", + "2 False 56 manual_static_filter \n", + "3 False 56 manual_static_filter \n", + "4 False 56 manual_static_filter \n", + "... ... ... ... \n", + "768019 False 56 manual_static_filter \n", + "768020 False 56 manual_static_filter \n", + "768021 False 56 manual_static_filter \n", + "768022 False 56 manual_static_filter \n", + "768023 False 56 manual_static_filter \n", + "\n", + "[768024 rows x 7 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# finally, merge\n", + "\n", + "# pour df1_customer_target_mappings on enlève les colonnes name, extra_field, et updated_at (valeur égale à created_at)\n", + "# note : by making a left join on df1_customer_target_mappings, we suppress 2 targets that have no customer associated\n", + "\n", + "df1_customer_targets = pd.merge(df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]], \n", + " df1_targets_full, left_on='target_id', right_on='target_id', how='left')\n", + "df1_customer_targets" + ] + }, + { + "cell_type": "markdown", + "id": "52326267-c5ba-4e21-b8ab-4b4c62de75d1", + "metadata": {}, + "source": [ + "## Campaign stats, 
campaigns" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "06dca910-5c07-4ee1-bbf2-3b11b48ba1f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameservice_idcreated_atupdated_atprocess_idreport_urlcategoryto_be_syncedidentifiersent_at
01319613newsletter enseignants janvier 20227212022-01-14 16:06:42.586321+01:002022-02-03 14:17:27.112963+01:00NaNNaN0.0Falseaba3b6fd5d186d28e06ff97135cade7f2022-01-14 00:00:00+01:00
11319586lsf_janvier_20227172022-01-07 11:30:35.315895+01:002022-02-03 14:17:27.116171+01:00NaNNaN0.0False788d986905533aba051261497ecffcbb2022-01-07 00:00:00+01:00
21319282Invitation à déjeuner au Mucem | Vernissage « ...5912021-09-28 12:50:24.448752+02:002022-02-03 14:17:27.119582+01:00NaNNaN0.0False3493894fa4ea036cfc6433c3e2ee63b02021-09-28 00:00:00+02:00
31319283Vacances de la Toussaint - centres des loisirs5902021-09-28 18:01:04.692073+02:002022-02-03 14:17:27.124408+01:00NaNNaN0.0False08b255a5d42b89b0585260b6f2360bdd2021-09-28 00:00:00+02:00
41319636ddcp_promo_md_livemag7302022-01-27 18:00:41.053069+01:002022-02-03 14:17:27.127607+01:00NaNNaN0.0Falsed5cfead94f5350c12c322b5b664544c12022-01-27 00:00:00+01:00
\n", + "
" + ], + "text/plain": [ + " id name service_id \\\n", + "0 1319613 newsletter enseignants janvier 2022 721 \n", + "1 1319586 lsf_janvier_2022 717 \n", + "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n", + "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n", + "4 1319636 ddcp_promo_md_livemag 730 \n", + "\n", + " created_at updated_at \\\n", + "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n", + "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n", + "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n", + "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n", + "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n", + "\n", + " process_id report_url category to_be_synced \\\n", + "0 NaN NaN 0.0 False \n", + "1 NaN NaN 0.0 False \n", + "2 NaN NaN 0.0 False \n", + "3 NaN NaN 0.0 False \n", + "4 NaN NaN 0.0 False \n", + "\n", + " identifier sent_at \n", + "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n", + "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n", + "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n", + "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n", + "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. campaigns\n", + "df1_campaigns.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "83eaa447-9144-41ed-9e26-f0f23799a8fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcampaign_idcustomer_idopened_atsent_atdelivered_atcreated_atupdated_at
01979358112597NaN2021-03-28 18:01:09+02:002021-03-28 18:24:18+02:002021-03-28 18:34:20.616136+02:002022-04-15 22:52:04.397693+02:00
11421158113666NaN2021-03-28 18:01:09+02:002021-03-28 18:21:02+02:002021-03-28 18:21:04.297213+02:002022-04-15 22:52:04.397693+02:00
21315058280561NaN2021-03-28 18:00:59+02:002021-03-28 18:08:45+02:002021-03-28 18:18:49.991042+02:002022-04-15 22:52:04.397693+02:00
37073581010072021-03-28 20:11:06+02:002021-03-28 18:00:59+02:002021-03-28 18:09:47+02:002021-03-28 18:09:50.915354+02:002022-04-15 22:52:04.397693+02:00
4517558103972NaN2021-03-28 18:01:06+02:002021-03-28 18:05:03+02:002021-03-28 18:05:08.507398+02:002022-04-15 22:52:04.397693+02:00
\n", + "
" + ], + "text/plain": [ + " id campaign_id customer_id opened_at \\\n", + "0 19793 58 112597 NaN \n", + "1 14211 58 113666 NaN \n", + "2 13150 58 280561 NaN \n", + "3 7073 58 101007 2021-03-28 20:11:06+02:00 \n", + "4 5175 58 103972 NaN \n", + "\n", + " sent_at delivered_at \\\n", + "0 2021-03-28 18:01:09+02:00 2021-03-28 18:24:18+02:00 \n", + "1 2021-03-28 18:01:09+02:00 2021-03-28 18:21:02+02:00 \n", + "2 2021-03-28 18:00:59+02:00 2021-03-28 18:08:45+02:00 \n", + "3 2021-03-28 18:00:59+02:00 2021-03-28 18:09:47+02:00 \n", + "4 2021-03-28 18:01:06+02:00 2021-03-28 18:05:03+02:00 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-28 18:34:20.616136+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "1 2021-03-28 18:21:04.297213+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "2 2021-03-28 18:18:49.991042+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "3 2021-03-28 18:09:50.915354+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "4 2021-03-28 18:05:08.507398+02:00 2022-04-15 22:52:04.397693+02:00 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 2. campaigns stats\n", + "df1_campaign_stats.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "7f25eb1b-e7c8-4715-bc30-7ac29a7181ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcampaign_idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
01979358112597NaN2021-03-28 18:01:09+02:002021-03-28 18:24:18+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
11421158113666NaN2021-03-28 18:01:09+02:002021-03-28 18:21:02+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
21315058280561NaN2021-03-28 18:00:59+02:002021-03-28 18:08:45+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
37073581010072021-03-28 20:11:06+02:002021-03-28 18:00:59+02:002021-03-28 18:09:47+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
4517558103972NaN2021-03-28 18:01:06+02:002021-03-28 18:05:03+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
\n", + "
" + ], + "text/plain": [ + " id campaign_id customer_id opened_at \\\n", + "0 19793 58 112597 NaN \n", + "1 14211 58 113666 NaN \n", + "2 13150 58 280561 NaN \n", + "3 7073 58 101007 2021-03-28 20:11:06+02:00 \n", + "4 5175 58 103972 NaN \n", + "\n", + " sent_at delivered_at \\\n", + "0 2021-03-28 18:01:09+02:00 2021-03-28 18:24:18+02:00 \n", + "1 2021-03-28 18:01:09+02:00 2021-03-28 18:21:02+02:00 \n", + "2 2021-03-28 18:00:59+02:00 2021-03-28 18:08:45+02:00 \n", + "3 2021-03-28 18:00:59+02:00 2021-03-28 18:09:47+02:00 \n", + "4 2021-03-28 18:01:06+02:00 2021-03-28 18:05:03+02:00 \n", + "\n", + " campaign_name campaign_service_id \\\n", + "0 Le Mucem chez vous, gardons le lien #22 404 \n", + "1 Le Mucem chez vous, gardons le lien #22 404 \n", + "2 Le Mucem chez vous, gardons le lien #22 404 \n", + "3 Le Mucem chez vous, gardons le lien #22 404 \n", + "4 Le Mucem chez vous, gardons le lien #22 404 \n", + "\n", + " campaign_sent_at \n", + "0 2021-03-28 00:00:00+01:00 \n", + "1 2021-03-28 00:00:00+01:00 \n", + "2 2021-03-28 00:00:00+01:00 \n", + "3 2021-03-28 00:00:00+01:00 \n", + "4 2021-03-28 00:00:00+01:00 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3. merge campaigns and campaigns stats\n", + "\n", + "df1_campaigns_full = pd.merge(df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]], \n", + " df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\"),\n", + " on = \"campaign_id\", how = \"left\")\n", + "df1_campaigns_full.head()" + ] + }, + { + "cell_type": "markdown", + "id": "87fc686a-4a80-40ab-9987-20d2774f3055", + "metadata": {}, + "source": [ + "## Link stats" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2f9df2d0-8a23-496b-8e92-617285f64530", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idclicked_atlink_idcustomer_idcreated_atupdated_at
012021-03-26 16:30:36+01:0012840332021-03-26 15:30:37.050161+01:002021-03-26 15:30:37.050161+01:00
122021-03-26 17:16:34+01:0021197682021-03-26 16:16:34.950871+01:002021-03-26 16:16:34.950871+01:00
22722021-03-28 20:03:32+02:00421131052021-03-28 18:03:32.736394+02:002021-03-28 18:03:32.736394+02:00
342021-03-26 17:43:19+01:0032722802021-03-26 16:43:19.338321+01:002021-03-26 16:43:19.338321+01:00
452021-03-26 17:46:00+01:0031050952021-03-26 16:46:00.502945+01:002021-03-26 16:46:00.502945+01:00
.....................
1510462435532023-11-09 16:34:27+01:00146669982023-11-09 15:34:29.425425+01:002023-11-09 15:34:29.425425+01:00
1510472435542023-11-09 16:34:35+01:00146709982023-11-09 15:34:37.505505+01:002023-11-09 15:34:37.505505+01:00
1510482435592023-11-09 16:51:15+01:0014686829232023-11-09 15:51:17.439518+01:002023-11-09 15:51:17.439518+01:00
1510492435612023-11-09 16:59:42+01:0014677829232023-11-09 15:59:44.030922+01:002023-11-09 15:59:44.030922+01:00
1510502435642023-11-09 17:16:41+01:001469112543552023-11-09 16:16:43.012932+01:002023-11-09 16:16:43.012932+01:00
\n", + "

151051 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id clicked_at link_id customer_id \\\n", + "0 1 2021-03-26 16:30:36+01:00 1 284033 \n", + "1 2 2021-03-26 17:16:34+01:00 2 119768 \n", + "2 272 2021-03-28 20:03:32+02:00 42 113105 \n", + "3 4 2021-03-26 17:43:19+01:00 3 272280 \n", + "4 5 2021-03-26 17:46:00+01:00 3 105095 \n", + "... ... ... ... ... \n", + "151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n", + "151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n", + "151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n", + "151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n", + "151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n", + "1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n", + "2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n", + "3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n", + "4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n", + "... ... ... \n", + "151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n", + "151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n", + "151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n", + "151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n", + "151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n", + "\n", + "[151051 rows x 6 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_link_stats" + ] + }, + { + "cell_type": "markdown", + "id": "aad6fb14-9694-4c1e-9885-1ebe0f38afe3", + "metadata": {}, + "source": [ + "## Bonus : peut-on lier link stats et campaign ? 
Non, les dates auxquelles le client clique sur le lien/ouvre la campagne ne permettent pas de faire coïncider link_id et campaign_id" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "8be7c974-72c9-4e31-a874-d7e5d2719fb3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idclicked_atlink_idcustomer_idcreated_atupdated_at
012021-03-26 16:30:36+01:0012840332021-03-26 15:30:37.050161+01:002021-03-26 15:30:37.050161+01:00
7526140182021-05-10 18:07:59+02:003122840332021-05-10 16:08:00.541322+02:002021-05-10 16:08:00.541322+02:00
968481334492021-03-25 08:42:22+01:0042840332022-04-15 22:51:01.994343+02:002022-04-15 22:51:01.994343+02:00
1157282075442022-08-23 10:33:04+02:00123652840332022-08-23 08:33:06.498908+02:002022-08-23 08:33:06.498908+02:00
\n", + "
" + ], + "text/plain": [ + " id clicked_at link_id customer_id \\\n", + "0 1 2021-03-26 16:30:36+01:00 1 284033 \n", + "7526 14018 2021-05-10 18:07:59+02:00 312 284033 \n", + "96848 133449 2021-03-25 08:42:22+01:00 4 284033 \n", + "115728 207544 2022-08-23 10:33:04+02:00 12365 284033 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n", + "7526 2021-05-10 16:08:00.541322+02:00 2021-05-10 16:08:00.541322+02:00 \n", + "96848 2022-04-15 22:51:01.994343+02:00 2022-04-15 22:51:01.994343+02:00 \n", + "115728 2022-08-23 08:33:06.498908+02:00 2022-08-23 08:33:06.498908+02:00 " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_link_stats[df1_link_stats[\"customer_id\"] == 284033]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "902e9947-58e1-44f4-b634-1239b0e4df02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcampaign_idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
403064340363764284033NaN2021-03-21 18:01:22+01:002021-03-21 18:08:04+01:00Le Mucem chez vous, gardons le lien #213982021-03-21 00:00:00+01:00
\n", + "
" + ], + "text/plain": [ + " id campaign_id customer_id opened_at \\\n", + "4030643 4036376 4 284033 NaN \n", + "\n", + " sent_at delivered_at \\\n", + "4030643 2021-03-21 18:01:22+01:00 2021-03-21 18:08:04+01:00 \n", + "\n", + " campaign_name campaign_service_id \\\n", + "4030643 Le Mucem chez vous, gardons le lien #21 398 \n", + "\n", + " campaign_sent_at \n", + "4030643 2021-03-21 00:00:00+01:00 " + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns_full[ (df1_campaigns_full[\"customer_id\"] == 284033) & (df1_campaigns_full[\"campaign_id\"] == 4)]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}