# Exploratory study of variables : targets, campaign and link stats

## First steps : package importations, set up working environment and import data

In [253]:
# importations

import os 
import s3fs
import pandas as pd
import re
from datetime import datetime, timezone, timedelta
import math
import numpy as np

In [188]:
# bucket for accessing the data

S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]

fs = s3fs.S3FileSystem(client_kwargs = {"endpoint_url" : S3_ENDPOINT_URL})
BUCKET = "bdc2324-data"
fs.ls(BUCKET)

['bdc2324-data/1',
 'bdc2324-data/10',
 'bdc2324-data/101',
 'bdc2324-data/11',
 'bdc2324-data/12',
 'bdc2324-data/13',
 'bdc2324-data/14',
 'bdc2324-data/2',
 'bdc2324-data/3',
 'bdc2324-data/4',
 'bdc2324-data/5',
 'bdc2324-data/6',
 'bdc2324-data/7',
 'bdc2324-data/8',
 'bdc2324-data/9']

In [3]:
FILE_PATH_S3 = fs.ls(BUCKET)[0] # focus on the company number 1
files_path =  fs.ls(FILE_PATH_S3)
files_path

['bdc2324-data/1/1campaign_stats.csv',
 'bdc2324-data/1/1campaigns.csv',
 'bdc2324-data/1/1categories.csv',
 'bdc2324-data/1/1countries.csv',
 'bdc2324-data/1/1currencies.csv',
 'bdc2324-data/1/1customer_target_mappings.csv',
 'bdc2324-data/1/1customersplus.csv',
 'bdc2324-data/1/1event_types.csv',
 'bdc2324-data/1/1events.csv',
 'bdc2324-data/1/1facilities.csv',
 'bdc2324-data/1/1link_stats.csv',
 'bdc2324-data/1/1pricing_formulas.csv',
 'bdc2324-data/1/1product_packs.csv',
 'bdc2324-data/1/1products.csv',
 'bdc2324-data/1/1products_groups.csv',
 'bdc2324-data/1/1purchases.csv',
 'bdc2324-data/1/1representation_category_capacities.csv',
 'bdc2324-data/1/1representations.csv',
 'bdc2324-data/1/1seasons.csv',
 'bdc2324-data/1/1structure_tag_mappings.csv',
 'bdc2324-data/1/1suppliers.csv',
 'bdc2324-data/1/1tags.csv',
 'bdc2324-data/1/1target_types.csv',
 'bdc2324-data/1/1targets.csv',
 'bdc2324-data/1/1tickets.csv',
 'bdc2324-data/1/1type_of_categories.csv',
 'bdc2324-data/1/1type_of_pr

In [4]:
# loop to create dataframes related to company 1

client_number = files_path[0].split("/")[1]
df_prefix = "df" + str(client_number) + "_"

for i in range(len(files_path)) :
    current_path = files_path[i]
    with fs.open(current_path, mode="rb") as file_in:
        df = pd.read_csv(file_in)
        # the pattern of the name is df1xxx
        nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
        globals()[nom_dataframe] = df

  df = pd.read_csv(file_in)


## Target, target types and customer target mapping

In [5]:
# 1. target types
df1_target_types.dtypes

id             int64
is_import       bool
name          object
created_at    object
updated_at    object
identifier    object
dtype: object

In [6]:
type(df1_target_types["created_at"][0])

str

In [7]:
df1_target_types

Unnamed: 0,id,is_import,name,created_at,updated_at,identifier
0,69,False,manual_dynamic_filter,2020-11-30 09:46:18.881030+01:00,2020-11-30 09:46:18.881030+01:00,e0f4b8693184850fefd6d2a38f10584e
1,48,True,manual_structure,2020-11-04 17:16:19.548275+01:00,2020-11-04 17:16:19.548275+01:00,382bca214204a2d3462f5ec2728d5d1e
2,1,True,manual_import,2020-10-14 18:37:40.521623+02:00,2020-10-14 18:37:40.521623+02:00,12213df2ce68a624e4c0070521437bac
3,56,False,manual_static_filter,2020-11-04 18:08:37.233486+01:00,2020-11-04 18:08:37.233486+01:00,fb27e81baa4debc6a4e1a8639c20e808


In [8]:
# 2. targets

df1_targets.head()

Unnamed: 0,id,target_type_id,name,created_at,updated_at
0,217,56,DDCP PROMO Art contemporain - salle de chauffe...,2021-01-04 15:00:05.401899+01:00,2021-03-02 18:38:19.025969+01:00
1,701,56,consentement optin scolaires,2021-12-21 16:03:59.840785+01:00,2022-02-18 17:23:44.761388+01:00
2,134,56,DDCP Newsletter jeune public,2020-11-10 09:43:19.667471+01:00,2021-03-02 18:38:19.052304+01:00
3,700,56,consentement optout scolaires,2021-12-21 16:01:57.524946+01:00,2022-02-18 17:23:44.807776+01:00
4,964,56,DDCP achat billet nbr dep 19052021,2022-04-14 10:58:17.142834+02:00,2022-04-14 10:58:23.677264+02:00


In [30]:
df1_targets.dtypes

id                 int64
target_type_id     int64
name              object
created_at        object
updated_at        object
dtype: object

In [33]:
type(df1_targets["created_at"][0])

str

In [35]:
# valeurs manquantes

df1_targets.isna().sum()

id                0
target_type_id    0
name              0
created_at        0
updated_at        0
dtype: int64

In [36]:
df1_targets["name"].nunique()

287

In [37]:
df1_targets

Unnamed: 0,id,target_type_id,name,created_at,updated_at
0,217,56,DDCP PROMO Art contemporain - salle de chauffe...,2021-01-04 15:00:05.401899+01:00,2021-03-02 18:38:19.025969+01:00
1,701,56,consentement optin scolaires,2021-12-21 16:03:59.840785+01:00,2022-02-18 17:23:44.761388+01:00
2,134,56,DDCP Newsletter jeune public,2020-11-10 09:43:19.667471+01:00,2021-03-02 18:38:19.052304+01:00
3,700,56,consentement optout scolaires,2021-12-21 16:01:57.524946+01:00,2022-02-18 17:23:44.807776+01:00
4,964,56,DDCP achat billet nbr dep 19052021,2022-04-14 10:58:17.142834+02:00,2022-04-14 10:58:23.677264+02:00
...,...,...,...,...,...
282,1811,1,ddcp_promo_ribambelle_2022_mapado_naikko_opt in,2022-11-30 15:57:05.681956+01:00,2022-11-30 16:00:32.649210+01:00
283,2006,1,cp 14 mars,2023-03-03 18:07:00.223750+01:00,2023-03-03 18:15:01.390970+01:00
284,2193,1,ddcp fichier musique 2,2023-04-14 14:33:53.628142+02:00,2023-04-14 15:00:35.608210+02:00
285,2429,1,import_mucem,2023-06-26 18:32:40.146757+02:00,2023-06-26 18:45:02.614668+02:00


In [13]:
# 3. customer target mapping

df1_customer_target_mappings.head()

Unnamed: 0,id,customer_id,target_id,created_at,updated_at,name,extra_field
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00,2021-09-23 09:35:47.617275+02:00,,
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00,2021-09-23 09:35:47.668846+02:00,,
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00,2021-09-23 12:02:51.253269+02:00,,
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00,2021-09-23 12:20:47.394480+02:00,,
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00,2021-09-28 16:02:29.372608+02:00,,


In [19]:
df1_customer_target_mappings.isna().sum()/df1_customer_target_mappings.shape[0]

id             0.000000
customer_id    0.000000
target_id      0.000000
created_at     0.000022
updated_at     0.000022
name           1.000000
extra_field    1.000000
dtype: float64

In [17]:
df1_customer_target_mappings["id"].nunique()
# df1_customer_target_mappings.shape[0]

768024

In [28]:
# les couples customer_id / target_id sont-ils uniques ?
df1_customer_target_mappings.duplicated(subset = ["customer_id", "target_id"]).sum() # aucun doublon

0

In [30]:
# 4.1. merge target with target type

df1_target_types[["id","is_import","name","identifier"]].add_prefix("target_type_")

Unnamed: 0,target_type_id,target_type_is_import,target_type_name,target_type_identifier
0,69,False,manual_dynamic_filter,e0f4b8693184850fefd6d2a38f10584e
1,48,True,manual_structure,382bca214204a2d3462f5ec2728d5d1e
2,1,True,manual_import,12213df2ce68a624e4c0070521437bac
3,56,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808


In [94]:
# merge

df1_targets_full = pd.merge(df1_targets, df1_target_types[["id","is_import","name","identifier"]].add_prefix("target_type_"), left_on='target_type_id', right_on='target_type_id', how='left')
df1_targets_full

Unnamed: 0,id,target_type_id,name,created_at,updated_at,target_type_is_import,target_type_name,target_type_identifier
0,217,56,DDCP PROMO Art contemporain - salle de chauffe...,2021-01-04 15:00:05.401899+01:00,2021-03-02 18:38:19.025969+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
1,701,56,consentement optin scolaires,2021-12-21 16:03:59.840785+01:00,2022-02-18 17:23:44.761388+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
2,134,56,DDCP Newsletter jeune public,2020-11-10 09:43:19.667471+01:00,2021-03-02 18:38:19.052304+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
3,700,56,consentement optout scolaires,2021-12-21 16:01:57.524946+01:00,2022-02-18 17:23:44.807776+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
4,964,56,DDCP achat billet nbr dep 19052021,2022-04-14 10:58:17.142834+02:00,2022-04-14 10:58:23.677264+02:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
...,...,...,...,...,...,...,...,...
282,1811,1,ddcp_promo_ribambelle_2022_mapado_naikko_opt in,2022-11-30 15:57:05.681956+01:00,2022-11-30 16:00:32.649210+01:00,True,manual_import,12213df2ce68a624e4c0070521437bac
283,2006,1,cp 14 mars,2023-03-03 18:07:00.223750+01:00,2023-03-03 18:15:01.390970+01:00,True,manual_import,12213df2ce68a624e4c0070521437bac
284,2193,1,ddcp fichier musique 2,2023-04-14 14:33:53.628142+02:00,2023-04-14 15:00:35.608210+02:00,True,manual_import,12213df2ce68a624e4c0070521437bac
285,2429,1,import_mucem,2023-06-26 18:32:40.146757+02:00,2023-06-26 18:45:02.614668+02:00,True,manual_import,12213df2ce68a624e4c0070521437bac


In [32]:
# 4.2. merge df1_customer_target_mappings with df1_targets_full

df1_customer_target_mappings.head()

Unnamed: 0,id,customer_id,target_id,created_at,updated_at,name,extra_field
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00,2021-09-23 09:35:47.617275+02:00,,
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00,2021-09-23 09:35:47.668846+02:00,,
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00,2021-09-23 12:02:51.253269+02:00,,
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00,2021-09-23 12:20:47.394480+02:00,,
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00,2021-09-28 16:02:29.372608+02:00,,


In [42]:
# Q : les dates de création et de mise à jour de la table customer target mapping sont elles égales ??

# 17 observations for which creation date != update date, ms ce sont que des Nan, OK !
(df1_customer_target_mappings["created_at"] != df1_customer_target_mappings["updated_at"]).sum() 

17

In [43]:
df1_customer_target_mappings[df1_customer_target_mappings["created_at"] != df1_customer_target_mappings["updated_at"]]

Unnamed: 0,id,customer_id,target_id,created_at,updated_at,name,extra_field
605484,1691570,661701,264,,,,
654549,1832071,651594,264,,,,
654550,1832072,663061,264,,,,
654551,1832073,663114,264,,,,
655162,1949466,663865,264,,,,
754038,2154438,664300,264,,,,
760929,2282079,665557,264,,,,
760930,2282080,665563,264,,,,
761787,2675293,661492,264,,,,
761798,2721237,665931,264,,,,


In [44]:
# ces données manquantes concernent le target avec id 264, mais les autres valeurs pr ce même target sont bien renseignées
df1_customer_target_mappings[df1_customer_target_mappings["target_id"]==264]

Unnamed: 0,id,customer_id,target_id,created_at,updated_at,name,extra_field
140,3416265,1751,264,2022-01-28 20:00:16.448920+01:00,2022-01-28 20:00:16.448920+01:00,,
149,3416274,2213,264,2022-01-28 20:30:17.323634+01:00,2022-01-28 20:30:17.323634+01:00,,
1120,4292054,1156059,264,2022-09-29 07:00:43.003440+02:00,2022-09-29 07:00:43.003440+02:00,,
1121,4292055,1156063,264,2022-09-29 07:00:43.003440+02:00,2022-09-29 07:00:43.003440+02:00,,
4006,4428048,34916,264,2023-03-14 07:01:27.868349+01:00,2023-03-14 07:01:27.868349+01:00,,
...,...,...,...,...,...,...,...
761801,2721240,665956,264,,,,
767918,2736960,666466,264,,,,
767919,2736961,666468,264,,,,
767968,2737357,666824,264,,,,


In [71]:
# Q : les dates de creation / update sont elles-uniques selon le client ou selon la target ?

df1_customer_target_mappings[df1_customer_target_mappings["target_id"]==217]["updated_at"].max()

'2021-10-28 11:30:42.717180+02:00'

In [65]:
df1_targets_full[df1_targets_full["id"]==217]

Unnamed: 0,id,target_type_id,name,created_at,updated_at,target_type_is_import,target_type_name,target_type_identifier
0,217,56,DDCP PROMO Art contemporain - salle de chauffe...,2021-01-04 15:00:05.401899+01:00,2021-03-02 18:38:19.025969+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808


In [69]:
df1_targets_full.head()

Unnamed: 0,id,target_type_id,name,created_at,updated_at,target_type_is_import,target_type_name,target_type_identifier
0,217,56,DDCP PROMO Art contemporain - salle de chauffe...,2021-01-04 15:00:05.401899+01:00,2021-03-02 18:38:19.025969+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
1,701,56,consentement optin scolaires,2021-12-21 16:03:59.840785+01:00,2022-02-18 17:23:44.761388+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
2,134,56,DDCP Newsletter jeune public,2020-11-10 09:43:19.667471+01:00,2021-03-02 18:38:19.052304+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
3,700,56,consentement optout scolaires,2021-12-21 16:01:57.524946+01:00,2022-02-18 17:23:44.807776+01:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
4,964,56,DDCP achat billet nbr dep 19052021,2022-04-14 10:58:17.142834+02:00,2022-04-14 10:58:23.677264+02:00,False,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808


In [74]:
df1_customer_target_mappings

Unnamed: 0,id,customer_id,target_id,created_at,updated_at,name,extra_field
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00,2021-09-23 09:35:47.617275+02:00,,
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00,2021-09-23 09:35:47.668846+02:00,,
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00,2021-09-23 12:02:51.253269+02:00,,
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00,2021-09-23 12:20:47.394480+02:00,,
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00,2021-09-28 16:02:29.372608+02:00,,
...,...,...,...,...,...,...,...
768019,2737545,666983,345,2021-12-14 14:48:05.456842+01:00,2021-12-14 14:48:05.456842+01:00,,
768020,2737546,666983,346,2021-12-14 14:48:05.465830+01:00,2021-12-14 14:48:05.465830+01:00,,
768021,2737575,666986,346,2021-12-14 23:15:42.757832+01:00,2021-12-14 23:15:42.757832+01:00,,
768022,2737576,666987,345,2021-12-15 00:14:59.018215+01:00,2021-12-15 00:14:59.018215+01:00,,


In [104]:
# change the position of the column target type id

# Spécifiez le nom de la colonne à déplacer et la colonne après laquelle vous souhaitez la placer
column_to_move = 'target_type_id'

# Récupérez l'index de la colonne de référence
reference_index = df1_targets_full.columns.get_loc("target_type_name")

# Créez une copie de la colonne que vous voulez déplacer
column_copy = df1_targets_full[column_to_move].copy()

# Supprimez la colonne d'origine
df1_targets_full = df1_targets_full.drop(column_to_move, axis=1)

# Utilisez la méthode insert pour déplacer la colonne à la nouvelle position
df1_targets_full.insert(reference_index - 1, column_to_move, column_copy)

In [109]:
df1_targets_full = df1_targets_full.rename(columns=lambda x: 'target_' + x if not x.startswith('target_') else x)
df1_targets_full.head()

Unnamed: 0,target_id,target_name,target_created_at,target_updated_at,target_type_is_import,target_type_id,target_type_name,target_type_identifier
0,217,DDCP PROMO Art contemporain - salle de chauffe...,2021-01-04 15:00:05.401899+01:00,2021-03-02 18:38:19.025969+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
1,701,consentement optin scolaires,2021-12-21 16:03:59.840785+01:00,2022-02-18 17:23:44.761388+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
2,134,DDCP Newsletter jeune public,2020-11-10 09:43:19.667471+01:00,2021-03-02 18:38:19.052304+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
3,700,consentement optout scolaires,2021-12-21 16:01:57.524946+01:00,2022-02-18 17:23:44.807776+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
4,964,DDCP achat billet nbr dep 19052021,2022-04-14 10:58:17.142834+02:00,2022-04-14 10:58:23.677264+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808


In [110]:
df1_customer_target_mappings[["id", "customer_id", "target_id", "created_at"]].head()

Unnamed: 0,id,customer_id,target_id,created_at
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00


In [111]:
# finally, merge

# pour df1_customer_target_mappings on enlève les colonnes name, extra_field, et updated_at (valeur égale à created_at)
# note : by making a left join on df1_customer_target_mappings, we suppress 2 targets that have no customer associated

df1_customer_targets = pd.merge(df1_customer_target_mappings[["id", "customer_id", "target_id", "created_at"]], 
                                df1_targets_full, left_on='target_id', right_on='target_id', how='left')
df1_customer_targets

Unnamed: 0,id,customer_id,target_id,created_at,target_name,target_created_at,target_updated_at,target_type_is_import,target_type_id,target_type_name,target_type_identifier
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00,DDCP PROMO Réseau livres,2020-11-04 18:40:49.500866+01:00,2021-03-02 18:38:19.084287+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00,DDCP PROMO Art contemporain,2020-11-04 18:38:53.016572+01:00,2021-04-16 17:17:25.850107+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00,DDCP PROMO Art contemporain,2020-11-04 18:38:53.016572+01:00,2021-04-16 17:17:25.850107+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
...,...,...,...,...,...,...,...,...,...,...,...
768019,2737545,666983,345,2021-12-14 14:48:05.456842+01:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
768020,2737546,666983,346,2021-12-14 14:48:05.465830+01:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
768021,2737575,666986,346,2021-12-14 23:15:42.757832+01:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
768022,2737576,666987,345,2021-12-15 00:14:59.018215+01:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808


In [138]:
# rq : on dirait que la date de création des targets est à peine inférieure à la date minimum de création des targets des customers 
# idée : les targets sont créées puis envoyées aux clients, d'où un léger délai 
# mais question substiste : pourquoi les clients ne reçoivent-ils pas la target en même temps ? 

# vérifions que la date de création de la target est tjrs inférieure à la date de création minimum pour tous les clients ayant reçu la target

# first step : convert strings into dates

df1_customer_targets["created_at"] = df1_customer_targets["created_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)
df1_customer_targets["target_created_at"] = df1_customer_targets["target_created_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)
df1_customer_targets["target_updated_at"] = df1_customer_targets["target_updated_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)



Unnamed: 0,id,customer_id,target_id,created_at,target_name,target_created_at,target_updated_at,target_type_is_import,target_type_id,target_type_name,target_type_identifier
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00,DDCP PROMO Réseau livres,2020-11-04 18:40:49.500866+01:00,2021-03-02 18:38:19.084287+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00,DDCP PROMO Art contemporain,2020-11-04 18:38:53.016572+01:00,2021-04-16 17:17:25.850107+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00,DDCP PROMO Art contemporain,2020-11-04 18:38:53.016572+01:00,2021-04-16 17:17:25.850107+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
...,...,...,...,...,...,...,...,...,...,...,...
768019,2737545,666983,345,2021-12-14 14:48:05.456842+01:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
768020,2737546,666983,346,2021-12-14 14:48:05.465830+01:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
768021,2737575,666986,346,2021-12-14 23:15:42.757832+01:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808
768022,2737576,666987,345,2021-12-15 00:14:59.018215+01:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808


In [144]:
# second step : compute delay and minimum by target

df1_customer_targets["creation_delay"] = df1_customer_targets["created_at"] -df1_customer_targets["target_created_at"]


df1_customer_targets.groupby("target_id")["creation_delay"].min()

target_id
116     0 days 00:00:00.949028
117     0 days 00:00:00.037337
119     0 days 00:00:00.024423
120     0 days 00:00:00.058732
122     0 days 00:00:00.027283
                 ...          
2779    0 days 00:00:19.087958
2788    0 days 00:01:36.372927
2825    0 days 00:00:00.028771
2830    0 days 00:00:01.587058
2833    0 days 00:00:00.031071
Name: creation_delay, Length: 283, dtype: object

In [148]:
print(df1_customer_targets.groupby("target_id")["creation_delay"].min().min())
print((df1_customer_targets.groupby("target_id")["creation_delay"].min()).max())

0 days 00:00:00.009293
686 days 23:14:10.435866


In [153]:
# glt, le délai création de la target - création pour le premier client est très court, envoi quasi instantanné
# mais parfois, le délai est très long, plus d'une année pour les cas extrêmes

min_target_delay = df1_customer_targets.groupby("target_id")["creation_delay"].min()
min_target_delay[min_target_delay > timedelta(days=1)]

target_id
335     285 days 22:56:30.356536
339      86 days 21:34:19.282253
469       7 days 07:24:03.446563
490       3 days 16:28:38.068677
502       7 days 20:15:19.326651
515       1 days 22:49:33.761856
517      76 days 00:41:25.366394
528      26 days 06:17:44.689111
529       6 days 02:41:29.617761
530       1 days 04:34:33.843116
642     219 days 16:50:10.816034
695     668 days 03:31:22.896313
697      58 days 20:26:26.744823
699     686 days 23:14:10.435866
786     625 days 14:47:48.797084
1747     14 days 04:08:24.295840
2094    239 days 15:13:18.681637
2321    167 days 21:19:37.490219
Name: creation_delay, dtype: object

In [155]:
df1_customer_targets.groupby("target_type_id")["creation_delay"].min() # les target de type 1 ont un plus grd délai

target_type_id
1     0 days 00:00:06.490151
56    0 days 00:00:00.009293
69    0 days 00:00:00.032269
Name: creation_delay, dtype: object

In [159]:
print(df1_customer_targets["target_type_id"].unique())
print(df1_targets["target_type_id"].unique()) # rq : slt 3 types de target sur les 4 sont dans la table

[56 69  1]
[56 69  1]


In [165]:
# final visu : nice table for targets

# pour la suite, on peut supprimer la colonne creation delay, 
# était juste utile pour vérifier que la date de création était postérieure à la date de création de la target

df1_customer_targets.head(10)

Unnamed: 0,id,customer_id,target_id,created_at,target_name,target_created_at,target_updated_at,target_type_is_import,target_type_id,target_type_name,target_type_identifier,creation_delay
0,1184824,645400,130,2021-09-23 09:35:47.617275+02:00,DDCP PROMO Réseau livres,2020-11-04 18:40:49.500866+01:00,2021-03-02 18:38:19.084287+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"322 days, 13:54:58.116409"
1,1184825,645400,345,2021-09-23 09:35:47.668846+02:00,Inscrits NL générale site web,2021-04-16 17:17:26.069199+02:00,2021-04-16 17:17:26.069199+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"159 days, 16:18:21.599647"
2,1184828,645402,126,2021-09-23 12:02:51.253269+02:00,DDCP PROMO Art contemporain,2020-11-04 18:38:53.016572+01:00,2021-04-16 17:17:25.850107+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"322 days, 16:23:58.236697"
3,1184829,645403,126,2021-09-23 12:20:47.394480+02:00,DDCP PROMO Art contemporain,2020-11-04 18:38:53.016572+01:00,2021-04-16 17:17:25.850107+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"322 days, 16:41:54.377908"
4,1295770,647301,346,2021-09-28 16:02:29.372608+02:00,Votre première liste,2021-04-16 17:17:26.080378+02:00,2021-04-16 17:17:26.080378+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"164 days, 22:45:03.292230"
5,1184833,645627,398,2021-09-24 18:16:33.432760+02:00,DDCP PROMO MD participants ateliers yoga,2021-05-26 10:54:12.232999+02:00,2021-05-26 10:54:22.378253+02:00,False,69,manual_dynamic_filter,e0f4b8693184850fefd6d2a38f10584e,"121 days, 7:22:21.199761"
6,4452818,1208736,631,2023-05-06 03:29:43.875970+02:00,consentement optin b2b,2021-11-30 10:03:37.430645+01:00,2022-02-18 17:21:30.653027+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"521 days, 16:26:06.445325"
7,4291702,1155845,502,2022-09-28 12:55:36.843316+02:00,Automation_parrainage_newsletter_générale,2021-08-10 15:25:56.142538+02:00,2021-08-10 15:26:06.275964+02:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"413 days, 21:29:40.700778"
8,4096406,1121651,469,2022-07-31 11:45:19.694236+02:00,RI Newsletter Alexandrie (inscriptions formula...,2021-07-08 11:31:10.246495+02:00,2022-01-26 12:14:17.941253+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"388 days, 0:14:09.447741"
9,4452824,1208742,631,2023-05-06 03:29:43.901323+02:00,consentement optin b2b,2021-11-30 10:03:37.430645+01:00,2022-02-18 17:21:30.653027+01:00,False,56,manual_static_filter,fb27e81baa4debc6a4e1a8639c20e808,"521 days, 16:26:06.470678"


## Campaign stats, campaigns

In [189]:
# 1. campaigns

df1_campaigns.head()

Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
0,1319613,newsletter enseignants janvier 2022,721,2022-01-14 16:06:42.586321+01:00,2022-02-03 14:17:27.112963+01:00,,,0.0,False,aba3b6fd5d186d28e06ff97135cade7f,2022-01-14 00:00:00+01:00
1,1319586,lsf_janvier_2022,717,2022-01-07 11:30:35.315895+01:00,2022-02-03 14:17:27.116171+01:00,,,0.0,False,788d986905533aba051261497ecffcbb,2022-01-07 00:00:00+01:00
2,1319282,Invitation à déjeuner au Mucem | Vernissage « ...,591,2021-09-28 12:50:24.448752+02:00,2022-02-03 14:17:27.119582+01:00,,,0.0,False,3493894fa4ea036cfc6433c3e2ee63b0,2021-09-28 00:00:00+02:00
3,1319283,Vacances de la Toussaint - centres des loisirs,590,2021-09-28 18:01:04.692073+02:00,2022-02-03 14:17:27.124408+01:00,,,0.0,False,08b255a5d42b89b0585260b6f2360bdd,2021-09-28 00:00:00+02:00
4,1319636,ddcp_promo_md_livemag,730,2022-01-27 18:00:41.053069+01:00,2022-02-03 14:17:27.127607+01:00,,,0.0,False,d5cfead94f5350c12c322b5b664544c1,2022-01-27 00:00:00+01:00


In [171]:
# part de Nan pour chaque variable

df1_campaigns.isna().sum() / df1_campaigns.shape[0]

id              0.000000
name            0.000000
service_id      0.000000
created_at      0.000000
updated_at      0.000000
process_id      1.000000
report_url      1.000000
category        0.002090
to_be_synced    0.000000
identifier      0.000000
sent_at         0.003135
dtype: float64

In [185]:
df1_campaigns.dtypes

id                int64
name             object
service_id        int64
created_at       object
updated_at       object
process_id      float64
report_url      float64
category        float64
to_be_synced       bool
identifier       object
sent_at          object
dtype: object

In [186]:
type(df1_campaigns["identifier"][0])

str

In [187]:
# category

df1_campaigns["category"].isna()

array([ 0., nan])

In [191]:
# identifier

print(df1_campaigns["identifier"].nunique())
print(df1_campaigns.shape[0]) # identifier is unique

957
957


In [194]:
# service id

print(df1_campaigns.nunique()) # on a un identifiant de service par campagne, mais pas un nom unique

id              957
name            855
service_id      957
created_at      957
updated_at      957
process_id        0
report_url        0
category          1
to_be_synced      2
identifier      957
sent_at         737
dtype: int64


In [211]:
# name

df1_campaigns[df1_campaigns.duplicated(subset = ["name"], keep=False)].sort_values("name").head(20)

Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
777,1319239,"""L'Orient sonore"" au Mucem à partir du 22 juillet",184,2021-09-24 11:56:09.277085+02:00,2021-09-24 11:56:09.277085+02:00,,,0.0,False,6cdd60ea0045eb7a6ec44c54d29ed402,2020-07-15 00:00:00+02:00
778,1319240,"""L'Orient sonore"" au Mucem à partir du 22 juillet",181,2021-09-24 11:56:09.284647+02:00,2021-09-24 11:56:09.284647+02:00,,,0.0,False,fc221309746013ac554571fbd180e1c8,2020-07-09 00:00:00+02:00
255,1320926,Alexandrie NL2,1116,2023-01-31 11:08:55.915268+01:00,2023-01-31 11:08:56.286044+01:00,,,0.0,False,dd77279f7d325eec933f05b1672f6a1f,2023-01-31 12:08:54+01:00
161,1320910,Alexandrie NL2,1077,2023-01-24 09:01:00.250855+01:00,2023-01-24 09:01:00.271292+01:00,,,0.0,False,062ddb6c727310e76b6200b7c71f63b5,2023-01-24 10:00:58+01:00
241,1320574,Alexandrie NL2,731,2022-10-11 07:00:50.971513+02:00,2022-12-02 17:51:21.670983+01:00,,,0.0,False,59c33016884a62116be975a9bb8257e3,2022-10-11 00:00:00+02:00
317,1320972,Centres_loisirs _vacances de février,1124,2023-02-08 12:01:16.732961+01:00,2023-02-08 12:01:16.808008+01:00,,,0.0,False,c7635bfd99248a2cdef8249ef7bfbef4,2023-02-08 13:01:15+01:00
166,1320954,Centres_loisirs _vacances de février,1110,2023-02-01 09:30:41.267232+01:00,2023-02-01 09:30:41.354117+01:00,,,0.0,False,2cbca44843a864533ec05b321ae1f9d1,2023-02-01 10:30:40+01:00
672,148,Champ social décembre 2020,283,2021-04-03 18:24:42.186026+02:00,2021-09-24 11:56:08.182818+02:00,,,0.0,False,0f49c89d1e7298bb9930789c8ed59d48,2020-12-03 00:00:00+01:00
569,72,Champ social décembre 2020,284,2021-03-29 15:41:53.631952+02:00,2021-09-24 11:56:07.748770+02:00,,,0.0,False,46ba9f2a6976570b0353203ec4474217,2020-12-04 00:00:00+01:00
175,1319881,Champ social mars 2022,833,2022-04-25 10:00:26.029871+02:00,2022-12-02 17:51:22.319899+01:00,,,0.0,False,013a006f03dbc5392effeb8f18fda755,2022-04-25 00:00:00+02:00


In [207]:
df1_campaigns[df1_campaigns["name"]=="Champ social mars 2022"].duplicated(subset="name", keep=False)

175    True
316    True
dtype: bool

In [226]:
# to be synced 

share_campaigns_to_be_synced = round(100 * df1_campaigns["to_be_synced"].mean(),2)
print(f"Share of campaigns to synce : {share_campaigns_to_be_synced} % ") # 0.5% of campaigns to synce

Share of campaigns to synce : 0.52 % 


In [235]:
# focus : campaigns to synce - 5 cases
# la date d'envoie semble cohérente. Pas d'observation particulière sur ces cas ...

df1_campaigns[df1_campaigns["to_be_synced"]]

Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
43,1320752,dre_alors_on_sort0712_tech&cult1212_lesreveill...,1019,2022-11-28 09:30:31.189207+01:00,2022-12-02 17:51:23.474745+01:00,,,0.0,True,03e0704b5690a2dee1861dc3ad3316c9,2022-11-28 00:00:00+01:00
79,1320755,News hebdo du 5 au 4 décembre 2022,1060,2022-12-04 18:01:29.971417+01:00,2022-12-04 18:01:30.037656+01:00,,,0.0,True,299a23a2291e2126b91d54f3601ec162,2022-12-04 19:01:27+01:00
464,1320749,dre_le_sel_241122,1054,2022-11-24 09:01:37.467710+01:00,2022-12-02 17:51:23.622812+01:00,,,0.0,True,db576a7d2453575f29eab4bac787b919,2022-11-24 00:00:00+01:00
465,1320751,News hebdo du 28 novembre au 4 décembre,1057,2022-11-27 18:01:44.546081+01:00,2022-12-02 17:51:23.627178+01:00,,,0.0,True,d8700cbd38cc9f30cecb34f0c195b137,2022-11-27 00:00:00+01:00
888,1319474,ddcp_promo_temps fort salammbo,670,2021-11-25 13:19:41.547780+01:00,2022-02-03 14:17:27.728648+01:00,,,0.0,True,17c276c8e723eb46aef576537e9d56d0,2021-11-25 00:00:00+01:00


In [234]:
df1_campaigns[df1_campaigns["name"].isin(df1_campaigns[df1_campaigns["to_be_synced"]]["name"].unique()) ]

Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
43,1320752,dre_alors_on_sort0712_tech&cult1212_lesreveill...,1019,2022-11-28 09:30:31.189207+01:00,2022-12-02 17:51:23.474745+01:00,,,0.0,True,03e0704b5690a2dee1861dc3ad3316c9,2022-11-28 00:00:00+01:00
79,1320755,News hebdo du 5 au 4 décembre 2022,1060,2022-12-04 18:01:29.971417+01:00,2022-12-04 18:01:30.037656+01:00,,,0.0,True,299a23a2291e2126b91d54f3601ec162,2022-12-04 19:01:27+01:00
464,1320749,dre_le_sel_241122,1054,2022-11-24 09:01:37.467710+01:00,2022-12-02 17:51:23.622812+01:00,,,0.0,True,db576a7d2453575f29eab4bac787b919,2022-11-24 00:00:00+01:00
465,1320751,News hebdo du 28 novembre au 4 décembre,1057,2022-11-27 18:01:44.546081+01:00,2022-12-02 17:51:23.627178+01:00,,,0.0,True,d8700cbd38cc9f30cecb34f0c195b137,2022-11-27 00:00:00+01:00
888,1319474,ddcp_promo_temps fort salammbo,670,2021-11-25 13:19:41.547780+01:00,2022-02-03 14:17:27.728648+01:00,,,0.0,True,17c276c8e723eb46aef576537e9d56d0,2021-11-25 00:00:00+01:00


In [237]:
df1_campaigns[~df1_campaigns["to_be_synced"]].head(10)

Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
0,1319613,newsletter enseignants janvier 2022,721,2022-01-14 16:06:42.586321+01:00,2022-02-03 14:17:27.112963+01:00,,,0.0,False,aba3b6fd5d186d28e06ff97135cade7f,2022-01-14 00:00:00+01:00
1,1319586,lsf_janvier_2022,717,2022-01-07 11:30:35.315895+01:00,2022-02-03 14:17:27.116171+01:00,,,0.0,False,788d986905533aba051261497ecffcbb,2022-01-07 00:00:00+01:00
2,1319282,Invitation à déjeuner au Mucem | Vernissage « ...,591,2021-09-28 12:50:24.448752+02:00,2022-02-03 14:17:27.119582+01:00,,,0.0,False,3493894fa4ea036cfc6433c3e2ee63b0,2021-09-28 00:00:00+02:00
3,1319283,Vacances de la Toussaint - centres des loisirs,590,2021-09-28 18:01:04.692073+02:00,2022-02-03 14:17:27.124408+01:00,,,0.0,False,08b255a5d42b89b0585260b6f2360bdd,2021-09-28 00:00:00+02:00
4,1319636,ddcp_promo_md_livemag,730,2022-01-27 18:00:41.053069+01:00,2022-02-03 14:17:27.127607+01:00,,,0.0,False,d5cfead94f5350c12c322b5b664544c1,2022-01-27 00:00:00+01:00
5,1319614,News hebdo du 17 janv au 23 janv 2022,712,2022-01-16 18:01:28.974157+01:00,2022-02-03 14:17:27.130944+01:00,,,0.0,False,19bc916108fc6938f52cb96f7e087941,2022-01-16 00:00:00+01:00
6,1319263,ddcp_promo_automne_littérature_relance_nn_ouverts,586,2021-09-24 15:00:04.174247+02:00,2021-09-24 16:13:10.505400+02:00,,,0.0,False,605ff764c617d3cd28dbbdd72be8f9a2,2021-09-24 00:00:00+02:00
7,1319284,"Invitation au vernissage de l'exposition ""La C...",593,2021-09-30 14:47:18.135394+02:00,2022-02-03 14:17:27.134073+01:00,,,0.0,False,acc3e0404646c57502b480dc052c4fe1,2021-09-30 00:00:00+02:00
8,1319625,dre_mobilisations_artistiques_et_politiques,704,2022-01-27 10:01:16.716706+01:00,2022-02-03 14:17:27.172039+01:00,,,0.0,False,f64eac11f2cd8f0efa196f8ad173178e,2022-01-27 00:00:00+01:00
9,1319285,ddcp_promo_soyinka_taubira_infos_pratiques,594,2021-10-01 12:16:57.031796+02:00,2022-02-03 14:17:27.137444+01:00,,,0.0,False,076a0c97d09cf1a0ec3e19c7f2529f2b,2021-10-01 00:00:00+02:00


In [240]:
# 2. campaigns stats

df1_campaign_stats.head(10)

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at
0,19793,58,112597,,2021-03-28 18:01:09+02:00,2021-03-28 18:24:18+02:00,2021-03-28 18:34:20.616136+02:00,2022-04-15 22:52:04.397693+02:00
1,14211,58,113666,,2021-03-28 18:01:09+02:00,2021-03-28 18:21:02+02:00,2021-03-28 18:21:04.297213+02:00,2022-04-15 22:52:04.397693+02:00
2,13150,58,280561,,2021-03-28 18:00:59+02:00,2021-03-28 18:08:45+02:00,2021-03-28 18:18:49.991042+02:00,2022-04-15 22:52:04.397693+02:00
3,7073,58,101007,2021-03-28 20:11:06+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:09:47+02:00,2021-03-28 18:09:50.915354+02:00,2022-04-15 22:52:04.397693+02:00
4,5175,58,103972,,2021-03-28 18:01:06+02:00,2021-03-28 18:05:03+02:00,2021-03-28 18:05:08.507398+02:00,2022-04-15 22:52:04.397693+02:00
5,4809,58,104599,2021-03-28 18:12:12+02:00,2021-03-28 18:01:06+02:00,2021-03-28 18:04:18+02:00,2021-03-28 18:04:19.662496+02:00,2022-04-15 22:52:04.397693+02:00
6,11605,58,280579,2021-03-28 18:16:14+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:16:09+02:00,2021-03-28 18:16:10.974208+02:00,2022-04-15 22:52:04.397693+02:00
7,18714,58,34173,2021-03-29 05:31:37+02:00,2021-03-28 18:00:58+02:00,2021-03-28 18:31:02+02:00,2021-03-28 18:31:07.619032+02:00,2022-04-15 22:52:04.397693+02:00
8,17119,58,34992,,2021-03-28 18:00:58+02:00,2021-03-28 18:28:00+02:00,2021-03-28 18:28:03.574600+02:00,2022-04-15 22:52:04.397693+02:00
9,14001,58,35343,,2021-03-28 18:00:58+02:00,2021-03-28 18:20:48+02:00,2021-03-28 18:20:49.258826+02:00,2022-04-15 22:52:04.397693+02:00


In [242]:
df1_campaign_stats.isna().sum() / df1_campaign_stats.shape[0]

id              0.000000
campaign_id     0.000000
customer_id     0.000000
opened_at       0.807672
sent_at         0.000969
delivered_at    0.021495
created_at      0.000000
updated_at      0.000000
dtype: float64

In [243]:
df1_campaign_stats.dtypes

id               int64
campaign_id      int64
customer_id      int64
opened_at       object
sent_at         object
delivered_at    object
created_at      object
updated_at      object
dtype: object

In [244]:
print(df1_campaign_stats.shape[0])
print(df1_campaign_stats.nunique())

id              6214808
campaign_id         949
customer_id      130472
opened_at       1102699
sent_at          152184
delivered_at     380248
created_at      4295988
updated_at      2176478
dtype: int64

In [262]:
# 3. merge campaigns and campaigns stats

df1_campaign_stats.head()

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at
0,19793,58,112597,,2021-03-28 18:01:09+02:00,2021-03-28 18:24:18+02:00,2021-03-28 18:34:20.616136+02:00,2022-04-15 22:52:04.397693+02:00
1,14211,58,113666,,2021-03-28 18:01:09+02:00,2021-03-28 18:21:02+02:00,2021-03-28 18:21:04.297213+02:00,2022-04-15 22:52:04.397693+02:00
2,13150,58,280561,,2021-03-28 18:00:59+02:00,2021-03-28 18:08:45+02:00,2021-03-28 18:18:49.991042+02:00,2022-04-15 22:52:04.397693+02:00
3,7073,58,101007,2021-03-28 20:11:06+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:09:47+02:00,2021-03-28 18:09:50.915354+02:00,2022-04-15 22:52:04.397693+02:00
4,5175,58,103972,,2021-03-28 18:01:06+02:00,2021-03-28 18:05:03+02:00,2021-03-28 18:05:08.507398+02:00,2022-04-15 22:52:04.397693+02:00


In [273]:
# étape préalable au merge : les identifiants de campagne des deux tabes sont - ils égaux ?

id_campaigns = np.sort(df1_campaigns["id"].unique())
id_campaigns_stats = np.sort(df1_campaign_stats["campaign_id"].unique())
np.setdiff1d(id_campaigns, id_campaigns_stats)

array([1319243, 1319245, 1319247, 1319248, 1319250, 1319259, 1319260,
       1319262])

In [275]:
# ci-dessous des campagnes sans customer associé dans la table
# elles seront retirées lors du merge car pas utiles à notre étude
# on fera un merge à gauche en se basant sur campaign_stats 

df1_campaigns[df1_campaigns["id"].isin([1319243, 1319245, 1319247, 1319248, 1319250, 1319259, 1319260,
       1319262])]

Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
789,1319243,DRE Exposer le récit 13 mars,111,2021-09-24 11:56:09.307905+02:00,2021-09-24 11:56:09.307905+02:00,,,0.0,False,698d51a19d8a121ce581499d7b701668,2020-03-03 00:00:00+01:00
791,1319245,SDR Relance invit petit dej voyage voyages,109,2021-09-24 11:56:09.323919+02:00,2021-09-24 11:56:09.323919+02:00,,,0.0,False,2723d092b63885e0d7c260cc007e8b9d,2020-02-24 00:00:00+01:00
793,1319247,Au Mucem en 2020,97,2021-09-24 11:56:09.339127+02:00,2021-09-24 11:56:09.339127+02:00,,,0.0,False,e2ef524fbf3d9fe611d5a8e90fefdc9c,2020-01-31 00:00:00+01:00
794,1319248,DRE Giono,92,2021-09-24 11:56:09.346887+02:00,2021-09-24 11:56:09.346887+02:00,,,0.0,False,92cc227532d17e56e07902b254dfad10,2020-01-29 00:00:00+01:00
796,1319250,"Portes ouvertes ""Voyage, voyages"" au Mucem | M...",77,2021-09-24 11:56:09.362114+02:00,2021-09-24 11:56:09.362114+02:00,,,0.0,False,28dd2c7955ce926456240b2ff0100bde,2020-01-13 00:00:00+01:00
805,1319259,"Save the date | Vernissage ""Voyage, voyages"" a...",38,2021-09-24 11:56:09.432720+02:00,2021-09-24 11:56:09.432720+02:00,,,0.0,False,a5771bce93e200c36f7cd9dfd0e5deaa,2019-11-20 00:00:00+01:00
806,1319260,"Portes ouvertes ""Massilia Toy"" au Mucem | Merc...",37,2021-09-24 11:56:09.440465+02:00,2021-09-24 11:56:09.440465+02:00,,,0.0,False,a5bfc9e07964f8dddeb95fc584cd965d,2019-11-20 00:00:00+01:00
808,1319262,TENK S-1 Corse,17,2021-09-24 11:56:09.456460+02:00,2021-09-24 11:56:09.456460+02:00,,,0.0,False,70efdf2ec9b086079795c442636b55fb,2019-11-07 00:00:00+01:00


In [338]:
# merge 

# de campaigns on supprile les var valant tjrs NaN et to_be_synced qui semble pas très informatif

df1_campaigns_full = pd.merge(df1_campaign_stats, 
                              df1_campaigns[["id", "name", "service_id", "created_at", "updated_at", "sent_at", "identifier"]].add_prefix("campaign_"),
                              on = "campaign_id", how = "left")
df1_campaigns_full.head()

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at,campaign_name,campaign_service_id,campaign_created_at,campaign_updated_at,campaign_sent_at,campaign_identifier
0,19793,58,112597,,2021-03-28 18:01:09+02:00,2021-03-28 18:24:18+02:00,2021-03-28 18:34:20.616136+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
1,14211,58,113666,,2021-03-28 18:01:09+02:00,2021-03-28 18:21:02+02:00,2021-03-28 18:21:04.297213+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
2,13150,58,280561,,2021-03-28 18:00:59+02:00,2021-03-28 18:08:45+02:00,2021-03-28 18:18:49.991042+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
3,7073,58,101007,2021-03-28 20:11:06+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:09:47+02:00,2021-03-28 18:09:50.915354+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4,5175,58,103972,,2021-03-28 18:01:06+02:00,2021-03-28 18:05:03+02:00,2021-03-28 18:05:08.507398+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a


In [328]:
df1_campaigns_full.isna().sum()

id                           0
campaign_id                  0
customer_id                  0
opened_at              5019527
sent_at                   6023
delivered_at            133590
created_at                   0
updated_at                   0
campaign_name                0
campaign_service_id          0
campaign_created_at          0
campaign_updated_at          0
campaign_sent_at             6
campaign_identifier          0
dtype: int64

In [297]:
# lien entre sent at et campaign sent at ? 
# à quoi correspond la date de la campagne, est-ce le premier envoi à un client ?

# first step : transform dates to have the good format
# VERY time-consuming bc the df has 6M lines !!!!

from dateutil import parser

def convert_to_datetime(column):
    return column.apply(lambda x: parser.parse(str(x)) if pd.notna(x) else pd.NaT)

# Liste des colonnes à convertir
columns_to_convert = ["sent_at", "delivered_at", "created_at", "updated_at", 
                      "campaign_sent_at", "campaign_created_at", "campaign_updated_at"]

# Appliquer la fonction à chaque colonne spécifiée
df1_campaigns_full[columns_to_convert] = df1_campaigns_full[columns_to_convert].apply(convert_to_datetime)




In [329]:
# Exemple d'élément
date_string = '2021-03-28 18:01:09+02:00'

# Convertir en datetime en utilisant pd.to_datetime avec utc=True
datetime_object_utc = pd.to_datetime(date_string, utc=True)
print("UTC:", datetime_object_utc)

# Convertir en datetime en utilisant pd.to_datetime avec utc=False (ou sans spécifier utc)
datetime_object_local = pd.to_datetime(date_string, utc=False)
print("Local:", datetime_object_local)

UTC: 2021-03-28 16:01:09+00:00
Local: 2021-03-28 18:01:09+02:00


In [332]:
import pandas as pd

# Exemple d'élément
date_string = '2021-03-28 18:00:00+02:00'

# Convertir en datetime en utilisant pd.to_datetime avec utc=True
datetime_object_utc = pd.to_datetime(date_string, utc=True)

# Afficher l'objet datetime en UTC
print("UTC:", datetime_object_utc)

# Effectuer un calcul de différence entre deux dates en UTC
other_date_string = '2021-03-28 20:30:00+03:00'
other_datetime_object_utc = pd.to_datetime(other_date_string, utc=True)

# Calculer la différence entre les dates
time_difference = other_datetime_object_utc - datetime_object_utc

# Afficher la différence
print("Différence en heures:", time_difference.total_seconds() / 3600)


UTC: 2021-03-28 16:00:00+00:00
Différence en heures: 1.5


In [321]:
# etape supp pour s'assurer que les dates non convertibles sont bien des Nan

df1_campaigns_full[columns_to_convert] = df1_campaigns_full[columns_to_convert].apply(pd.to_datetime, errors='coerce')
df1_campaigns_full.head()

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at,campaign_name,campaign_service_id,campaign_created_at,campaign_updated_at,campaign_sent_at,campaign_identifier
0,19793,58,112597,,2021-03-28 18:01:09+02:00,2021-03-28 18:24:18+02:00,2021-03-28 18:34:20.616136+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
1,14211,58,113666,,2021-03-28 18:01:09+02:00,2021-03-28 18:21:02+02:00,2021-03-28 18:21:04.297213+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
2,13150,58,280561,,2021-03-28 18:00:59+02:00,2021-03-28 18:08:45+02:00,2021-03-28 18:18:49.991042+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
3,7073,58,101007,2021-03-28 20:11:06+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:09:47+02:00,2021-03-28 18:09:50.915354+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4,5175,58,103972,,2021-03-28 18:01:06+02:00,2021-03-28 18:05:03+02:00,2021-03-28 18:05:08.507398+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a


In [333]:
# autre methode

df1_campaigns_full["sent_at"] = pd.to_datetime(df1_campaigns_full["sent_at"] , utc=False).astype('datetime64[ns]')

  df1_campaigns_full["sent_at"] = pd.to_datetime(df1_campaigns_full["sent_at"] , utc=False).astype('datetime64[ns]')


ValueError: Tz-aware datetime.datetime cannot be converted to datetime64 unless utc=True, at position 18

In [334]:
import pandas as pd

# Exemple de DataFrame avec une colonne 'sent_at' contenant des dates en format string
df1_campaigns_full = pd.DataFrame({
    'sent_at': ['2022-01-01 12:34:56+02:00', '2022-02-01 15:45:30+02:00', '2022-03-01 18:30:00+02:00']
})

# Convertir la colonne 'sent_at' en datetime en conservant l'information sur le fuseau horaire (datetime64[ns])
df1_campaigns_full['sent_at'] = pd.to_datetime(df1_campaigns_full['sent_at'], utc=True)

# Afficher le DataFrame résultant
print(df1_campaigns_full)


                    sent_at
0 2022-01-01 10:34:56+00:00
1 2022-02-01 13:45:30+00:00
2 2022-03-01 16:30:00+00:00


In [324]:
# comparison 

print(df1_campaigns_full.groupby("campaign_id")["campaign_sent_at"].first()) # envoi des campagnes
print(df1_campaigns_full.groupby("campaign_id")["sent_at"].dropna().min())

campaign_id
1         2021-03-24 00:00:00+01:00
2         2021-03-14 00:00:00+01:00
3         2021-03-15 00:00:00+01:00
4         2021-03-21 00:00:00+01:00
5         2021-03-10 00:00:00+01:00
                     ...           
1321501   2023-11-06 13:30:12+01:00
1321503   2023-11-07 17:31:16+01:00
1321505   2023-11-08 11:15:52+01:00
1321506   2023-11-08 19:00:25+01:00
1321507   2023-11-08 19:00:37+01:00
Name: campaign_sent_at, Length: 949, dtype: datetime64[ns, tzoffset(None, 3600)]


TypeError: 'bool' object is not callable

In [325]:
df1_campaigns_full.isna().sum()

id                           0
campaign_id                  0
customer_id                  0
opened_at              5019527
sent_at                2741358
delivered_at           2807002
created_at             1547090
updated_at              766803
campaign_name                0
campaign_service_id          0
campaign_created_at    2216183
campaign_updated_at    2561268
campaign_sent_at       3504140
campaign_identifier          0
dtype: int64

In [326]:
df1_campaigns.isna().sum()

id                0
name              0
service_id        0
created_at        0
updated_at        0
process_id      957
report_url      957
category          2
to_be_synced      0
identifier        0
sent_at           3
dtype: int64

In [320]:
# df1_campaigns_full["sent_at"] = 
print(pd.to_datetime(df1_campaigns_full["sent_at"], errors='coerce').min())
print(df1_campaigns_full["sent_at"].dropna().min())

2020-06-02 10:24:08+02:00
2020-06-02 10:24:08+02:00


In [313]:
df1_campaigns_full["sent_at"].apply(lambda x : isinstance(x, datetime)).sum()
# df1_campaigns_full["sent_at"].tail(30)

6214808

In [314]:
df1_campaigns_full.shape[0]

6214808

In [340]:
df1_campaigns_full.dtypes

id                      int64
campaign_id             int64
customer_id             int64
opened_at              object
sent_at                object
delivered_at           object
created_at             object
updated_at             object
campaign_name          object
campaign_service_id     int64
campaign_created_at    object
campaign_updated_at    object
campaign_sent_at       object
campaign_identifier    object
dtype: object

In [341]:
df1_campaigns_full.head()

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at,campaign_name,campaign_service_id,campaign_created_at,campaign_updated_at,campaign_sent_at,campaign_identifier
0,19793,58,112597,,2021-03-28 18:01:09+02:00,2021-03-28 18:24:18+02:00,2021-03-28 18:34:20.616136+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
1,14211,58,113666,,2021-03-28 18:01:09+02:00,2021-03-28 18:21:02+02:00,2021-03-28 18:21:04.297213+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
2,13150,58,280561,,2021-03-28 18:00:59+02:00,2021-03-28 18:08:45+02:00,2021-03-28 18:18:49.991042+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
3,7073,58,101007,2021-03-28 20:11:06+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:09:47+02:00,2021-03-28 18:09:50.915354+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4,5175,58,103972,,2021-03-28 18:01:06+02:00,2021-03-28 18:05:03+02:00,2021-03-28 18:05:08.507398+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a


In [342]:
# conversion colonne par colonne

# precision a la Ns
df1_campaigns_full["created_at"] = df1_campaigns_full["created_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)
# df1_campaigns_full["updated_at"] = df1_campaigns_full["updated_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)
# df1_campaigns_full["campaign_created_at"] = df1_campaigns_full["campaign_created_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)
# df1_campaigns_full["campaign_updated_at"] = df1_campaigns_full["campaign_updated_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)

# precision a la sec
# df1_campaigns_full["opened_at"] = df1_campaigns_full["opened_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S%z") if pd.notna(x) else pd.NaT)
# df1_campaigns_full["sent_at"] = df1_campaigns_full["sent_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S%z") if pd.notna(x) else pd.NaT)
# df1_campaigns_full["delivered_at"] = df1_campaigns_full["delivered_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S%z") if pd.notna(x) else pd.NaT)
# df1_campaigns_full["campaign_sent_at"] = df1_campaigns_full["campaign_sent_at"].apply(lambda x : datetime.strptime(str(x), "%Y-%m-%d %H:%M:%S.%f%z") if pd.notna(x) else pd.NaT)


ValueError: time data '2022-05-06 12:00:23+02:00' does not match format '%Y-%m-%d %H:%M:%S.%f%z'

In [346]:
df1_campaigns_full["created_at"].tail(30)

6214778    2023-10-23 09:31:50.168545+02:00
6214779    2023-10-23 09:31:28.570386+02:00
6214780    2023-10-23 09:02:26.494195+02:00
6214781    2023-10-23 09:32:34.454957+02:00
6214782    2023-10-23 09:31:29.139217+02:00
6214783    2023-10-23 09:32:06.223901+02:00
6214784    2023-10-23 09:31:52.702258+02:00
6214785    2023-10-23 09:31:45.051321+02:00
6214786    2023-10-23 09:32:55.350092+02:00
6214787    2023-10-23 09:33:14.007405+02:00
6214788    2023-10-23 09:32:44.645432+02:00
6214789    2023-10-23 09:02:27.578671+02:00
6214790    2023-10-23 09:34:24.879045+02:00
6214791    2023-10-23 09:34:02.075066+02:00
6214792    2023-10-23 09:33:20.349918+02:00
6214793    2023-10-23 09:34:25.631234+02:00
6214794    2023-10-23 09:34:27.581150+02:00
6214795    2023-10-23 09:31:45.192200+02:00
6214796    2023-10-23 09:32:52.018890+02:00
6214797    2023-10-23 09:02:01.558573+02:00
6214798    2023-10-23 09:34:48.543213+02:00
6214799    2023-10-23 09:32:15.109097+02:00
6214800    2023-10-23 09:34:26.5

In [349]:
import pandas as pd

# Exemple de DataFrame avec une colonne 'date_str' contenant des dates en formats différents
df = pd.DataFrame({
    'date_str': ['2022-05-06 12:00:23+02:00', '2023-10-23 09:31:50.168545+02:00']
})

# Convertir la colonne 'date_str' en datetime en conservant l'information sur le fuseau horaire (datetime64[ns])
df['date'] = pd.to_datetime(df['date_str'], utc=True)

# Afficher le DataFrame résultant
print(df)


ValueError: time data "2023-10-23 09:31:50.168545+02:00" doesn't match format "%Y-%m-%d %H:%M:%S%z", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [350]:

# Exemple de DataFrame avec une colonne 'date_str' contenant des dates en formats différents
df = pd.DataFrame({
    'date_str': ['2022-05-06 12:00:23+02:00', '023-10-23 09:31:50.168545+02:00']
})

# Fonction lambda pour convertir la colonne 'date_str' en datetime avec précision
def convert_to_datetime_with_precision(x):
    if pd.notna(x):
        # Format avec nanosecondes
        try:
            return pd.to_datetime(x, utc=True)
        except ValueError:
            pass

        # Format sans nanosecondes
        try:
            return pd.to_datetime(x, utc=True, format="%Y-%m-%d %H:%M:%S%z")
        except ValueError:
            pass

    return x

# Appliquer la fonction lambda à la colonne 'date_str'
df['date'] = df['date_str'].apply(convert_to_datetime_with_precision)

# Afficher le DataFrame résultant
print(df)



                          date_str                             date
0        2022-05-06 12:00:23+02:00        2022-05-06 10:00:23+00:00
1  023-10-23 09:31:50.168545+02:00  023-10-23 09:31:50.168545+02:00


In [351]:
# loop over all dates to convert 

columns_to_convert = ["sent_at", "delivered_at", "created_at", "updated_at", 
                      "campaign_sent_at", "campaign_created_at", "campaign_updated_at"]

for column in columns_to_convert :
    df1_campaigns_full[column] = df1_campaigns_full[column].apply(convert_to_datetime_with_precision)

KeyboardInterrupt: 

In [356]:
# tests

df1_campaigns_full[df1_campaigns_full["campaign_id"]==58].sort_values("sent_at")

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at,campaign_name,campaign_service_id,campaign_created_at,campaign_updated_at,campaign_sent_at,campaign_identifier
4081002,23728,58,8268,,2021-03-28 18:00:57+02:00,2021-03-28 18:43:38+02:00,2021-03-28 18:43:42.928685+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4076139,4552,58,1472,,2021-03-28 18:00:57+02:00,2021-03-28 18:03:26+02:00,2021-03-28 18:03:28.229670+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4081572,14070,58,7978,2021-03-29 08:38:06+02:00,2021-03-28 18:00:57+02:00,2021-03-28 18:20:45+02:00,2021-03-28 18:20:49.431860+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4094833,6969,58,32211,,2021-03-28 18:00:57+02:00,2021-03-28 18:09:18+02:00,2021-03-28 18:09:20.571462+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4094827,9662,58,30980,2021-04-04 17:54:51+02:00,2021-03-28 18:00:57+02:00,2021-03-28 18:03:29+02:00,2021-03-28 18:13:33.153720+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8961,12417,58,33400,2021-03-28 21:27:57+02:00,2021-03-28 18:17:35+02:00,2021-03-28 18:17:36+02:00,2021-03-28 18:17:36.735495+02:00,2021-03-28 19:27:57.503961+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
20380,18205,58,106495,,2021-03-28 18:30:08+02:00,2021-03-28 18:30:11+02:00,2021-03-28 18:30:11.453742+02:00,2021-03-28 18:30:11.474019+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
27298,22107,58,104781,,2021-03-28 18:39:55+02:00,2021-03-28 18:39:56+02:00,2021-03-28 18:39:56.430679+02:00,2021-03-28 18:39:56.435656+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
29107,22389,58,111570,,2021-03-28 18:40:38+02:00,2021-03-28 18:40:40+02:00,2021-03-28 18:40:40.975334+02:00,2021-03-28 18:40:40.979852+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a


In [364]:
df1_campaigns_full[df1_campaigns_full["campaign_id"]==630525].sort_values("sent_at")

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at,campaign_name,campaign_service_id,campaign_created_at,campaign_updated_at,campaign_sent_at,campaign_identifier
1681,571478,630525,291045,2021-06-18 14:23:57+02:00,2021-06-17 00:01:05+02:00,2021-06-17 18:15:02+02:00,2021-06-17 19:11:05.780774+02:00,2022-04-15 23:11:44.290919+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
868,571425,630525,272258,,2021-06-17 00:01:05+02:00,2021-06-17 18:14:37+02:00,2021-06-17 19:10:59.410221+02:00,2022-04-15 23:11:44.290919+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
488660,574486,630525,284414,,2021-06-17 00:01:05+02:00,2021-06-17 19:18:30+02:00,2021-06-17 19:24:37.325550+02:00,2022-04-15 23:11:44.290919+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
453503,553818,630525,280714,,2021-06-17 00:01:05+02:00,2021-06-17 07:18:06+02:00,2021-06-17 07:18:06.816543+02:00,2022-04-15 23:11:44.290919+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
600459,556431,630525,289484,,2021-06-17 00:01:05+02:00,2021-06-17 10:18:57+02:00,2021-06-17 10:18:57.692035+02:00,2022-04-15 23:11:44.290919+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514091,566709,630525,112554,,2021-06-17 14:00:35+02:00,2021-06-17 14:00:39+02:00,2021-06-17 14:00:39.523170+02:00,2021-06-17 14:00:39.551198+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
604901,569106,630525,33100,,2021-06-17 16:36:55+02:00,2021-06-17 16:36:55+02:00,2021-06-17 16:36:55.928814+02:00,2021-06-17 16:36:55.933170+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
478955,572372,630525,119502,,2021-06-17 18:25:17+02:00,2021-06-17 18:25:20+02:00,2021-06-17 19:13:02.489176+02:00,2021-06-17 19:13:02.520644+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2
477252,572282,630525,33826,,2021-06-17 18:25:21+02:00,2021-06-17 18:25:26+02:00,2021-06-17 19:13:01.993836+02:00,2021-06-17 19:13:02.006886+02:00,com_ddcp_campagne_de_qualification_contacts__n...,474,2021-06-17 00:02:11.388346+02:00,2021-09-24 11:56:08.931051+02:00,2021-06-17 00:00:00+02:00,25ddc0f8c9d3e22e03d3076f98d83cb2


## Link stats

In [238]:
df1_link_stats

Unnamed: 0,id,clicked_at,link_id,customer_id,created_at,updated_at
0,1,2021-03-26 16:30:36+01:00,1,284033,2021-03-26 15:30:37.050161+01:00,2021-03-26 15:30:37.050161+01:00
1,2,2021-03-26 17:16:34+01:00,2,119768,2021-03-26 16:16:34.950871+01:00,2021-03-26 16:16:34.950871+01:00
2,272,2021-03-28 20:03:32+02:00,42,113105,2021-03-28 18:03:32.736394+02:00,2021-03-28 18:03:32.736394+02:00
3,4,2021-03-26 17:43:19+01:00,3,272280,2021-03-26 16:43:19.338321+01:00,2021-03-26 16:43:19.338321+01:00
4,5,2021-03-26 17:46:00+01:00,3,105095,2021-03-26 16:46:00.502945+01:00,2021-03-26 16:46:00.502945+01:00
...,...,...,...,...,...,...
151046,243553,2023-11-09 16:34:27+01:00,14666,998,2023-11-09 15:34:29.425425+01:00,2023-11-09 15:34:29.425425+01:00
151047,243554,2023-11-09 16:34:35+01:00,14670,998,2023-11-09 15:34:37.505505+01:00,2023-11-09 15:34:37.505505+01:00
151048,243559,2023-11-09 16:51:15+01:00,14686,82923,2023-11-09 15:51:17.439518+01:00,2023-11-09 15:51:17.439518+01:00
151049,243561,2023-11-09 16:59:42+01:00,14677,82923,2023-11-09 15:59:44.030922+01:00,2023-11-09 15:59:44.030922+01:00


In [365]:
# share of Nan for every variable

df1_link_stats.isna().sum() / df1_link_stats.shape[0]

id             0.0
clicked_at     0.0
link_id        0.0
customer_id    0.0
created_at     0.0
updated_at     0.0
dtype: float64

In [366]:
# types of the variables 

df1_link_stats.dtypes

id              int64
clicked_at     object
link_id         int64
customer_id     int64
created_at     object
updated_at     object
dtype: object

In [367]:
# number of unique modalities

df1_link_stats.nunique()

id             151051
clicked_at     137121
link_id         10788
customer_id     26075
created_at      96565
updated_at      96565
dtype: int64

In [378]:
len(sorted(df1_link_stats["link_id"].unique()))

10788

In [379]:
len(sorted(df1_campaigns_full["campaign_id"].unique()))

949

In [380]:
df1_campaigns_full

Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at,campaign_name,campaign_service_id,campaign_created_at,campaign_updated_at,campaign_sent_at,campaign_identifier
0,19793,58,112597,,2021-03-28 18:01:09+02:00,2021-03-28 18:24:18+02:00,2021-03-28 18:34:20.616136+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
1,14211,58,113666,,2021-03-28 18:01:09+02:00,2021-03-28 18:21:02+02:00,2021-03-28 18:21:04.297213+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
2,13150,58,280561,,2021-03-28 18:00:59+02:00,2021-03-28 18:08:45+02:00,2021-03-28 18:18:49.991042+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
3,7073,58,101007,2021-03-28 20:11:06+02:00,2021-03-28 18:00:59+02:00,2021-03-28 18:09:47+02:00,2021-03-28 18:09:50.915354+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
4,5175,58,103972,,2021-03-28 18:01:06+02:00,2021-03-28 18:05:03+02:00,2021-03-28 18:05:08.507398+02:00,2022-04-15 22:52:04.397693+02:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-28 18:01:45.448313+02:00,2021-09-24 11:56:07.723413+02:00,2021-03-28 00:00:00+01:00,4f4adcbf8c6f66dcfc8a3282ac2bf10a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6214803,8302994,1321483,266155,2023-10-23 11:43:25+02:00,2023-10-23 11:32:33+02:00,2023-10-23 11:32:34+02:00,2023-10-23 09:32:36.564696+02:00,2023-10-23 09:43:28.038259+02:00,dre_nov_2023,1318,2023-10-23 09:31:19.927528+02:00,2023-10-23 09:31:20.033243+02:00,2023-10-23 11:31:17+02:00,76cf99d3614e23eabab16fb27e944bf9
6214804,8303307,1321483,21355,2023-10-23 11:44:02+02:00,2023-10-23 11:32:49+02:00,2023-10-23 11:32:49+02:00,2023-10-23 09:32:50.829641+02:00,2023-10-23 09:44:04.119578+02:00,dre_nov_2023,1318,2023-10-23 09:31:19.927528+02:00,2023-10-23 09:31:20.033243+02:00,2023-10-23 11:31:17+02:00,76cf99d3614e23eabab16fb27e944bf9
6214805,8304346,1321483,21849,2023-10-23 11:45:52+02:00,2023-10-23 11:33:28+02:00,2023-10-23 11:33:29+02:00,2023-10-23 09:33:31.102500+02:00,2023-10-23 09:45:55.927652+02:00,dre_nov_2023,1318,2023-10-23 09:31:19.927528+02:00,2023-10-23 09:31:20.033243+02:00,2023-10-23 11:31:17+02:00,76cf99d3614e23eabab16fb27e944bf9
6214806,8302037,1321483,667789,2023-10-23 11:47:32+02:00,2023-10-23 11:31:53+02:00,2023-10-23 11:31:54+02:00,2023-10-23 09:31:55.768547+02:00,2023-10-23 09:47:33.915460+02:00,dre_nov_2023,1318,2023-10-23 09:31:19.927528+02:00,2023-10-23 09:31:20.033243+02:00,2023-10-23 11:31:17+02:00,76cf99d3614e23eabab16fb27e944bf9
