Compare commits
correction ... main (136 commits)
File diff suppressed because one or more lines are too long

@@ -13,7 +13,7 @@ S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
 fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 
 # Import cleaning and merge functions
-exec(open('0_Cleaning_and_merge_functions.py').read())
+exec(open('utils_cleaning_and_merge.py').read())
 
 # Output folder
 BUCKET_OUT = "projet-bdc2324-team1"
@@ -30,7 +30,7 @@ def export_dataset(df, output_name):
     df.to_csv(file_out, index = False)
 
 ## 1 - Cleaning of the datasets
-for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"]:
+for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"]:#, "101"
 
     # Timer
     start = time.time()
@@ -51,9 +51,6 @@ for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
     ## Exportation
     export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
 
-    # Exportation
-    export_dataset(df = df1_campaigns_information, output_name = "1_Temp/Company 1 - Campaigns dataset clean.csv")
-
     if tenant_id == "101":
         # Cleaning product area
         products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = tenant_id)
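A side note on the pattern this hunk touches: the commits rename the exec'd helper (0_Cleaning_and_merge_functions.py becomes utils_cleaning_and_merge.py), and every script in the pipeline loads its helpers with exec(open(path).read()), which runs the helper file in the caller's namespace so its functions become available as if defined inline. A minimal, self-contained sketch of that mechanism; the helper content and the clean_dates name are invented for illustration, since the real helper is not shown in this compare:

import pathlib
import tempfile

# Hypothetical stand-in for utils_cleaning_and_merge.py, written to a temp
# folder so the sketch runs anywhere.
helper = pathlib.Path(tempfile.mkdtemp()) / "utils_cleaning_and_merge.py"
helper.write_text("def clean_dates(df):\n    return df.dropna()\n")

# The pattern used by the pipeline scripts: execute the helper file in the
# current global namespace, making its top-level functions directly callable.
exec(open(helper).read())

print(clean_dates)  # the function now exists in this namespace

A plain import would be the more conventional choice, but exec keeps the helper's functions in the script's own global namespace without any packaging; the utils_ rename at least makes the helper's role explicit.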
@@ -1,5 +1,8 @@
-# Business Data Challenge - Team 1
+# Purpose of the script : Construction of training and test datasets for modelling by company
+# Input : KPI construction function and clean databases in the 0_Input folder
+# Output : Train and test datasets by compagnies
 
+# Packages
 import pandas as pd
 import numpy as np
 import os
@@ -9,14 +12,12 @@ import warnings
 from datetime import date, timedelta, datetime
 from sklearn.model_selection import train_test_split
 
-
-
 # Create filesystem object
 S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
 fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 
 # Import KPI construction functions
-exec(open('0_KPI_functions.py').read())
+exec(open('utils_features_construction.py').read())
 
 # Ignore warning
 warnings.filterwarnings('ignore')
@@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')
 
 def dataset_construction(min_date, end_features_date, max_date, directory_path):
 
-    # Import customerplus
+    # Import of cleaned and merged datasets
     df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
     df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
     df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+    df_target_information = display_input_databases(directory_path, file_name = "target_information")
 
-    # if directory_path == "101":
-    #     df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
-    #     df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])
 
-    # Filtre de cohérence pour la mise en pratique de notre méthode
+    # Dates in datetime format
     max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
     min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
 
-    #Filtre de la base df_campaigns_information
-    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
+    # Filter for database df_campaigns_information
+    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
     df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
 
-    #Filtre de la base df_products_purchased_reduced
-    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
+    # Filter for database df_products_purchased_reduced
+    df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
 
     print("Data filtering : SUCCESS")
 
-    # Fusion de l'ensemble et creation des KPI
+    # Building and merging features
 
-    # KPI sur les campagnes publicitaires
+    # Campaigns features
     df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
 
-    # KPI sur le comportement d'achat
+    # Purchasing behavior features
     df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
 
-    # KPI sur les données socio-démographiques
+    # Socio-demographic features
     df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
 
+    # Targets features
+    df_targets_kpi = targets_KPI(df_target = df_target_information)
+
     print("KPIs construction : SUCCESS")
 
-    # Fusion avec KPI liés au customer
+    # Merge - campaigns features
     df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
+    df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
 
-    # Fusion avec KPI liés au comportement d'achat
-    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
+    # Merge - targets features
+    df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
 
     # Fill NaN values
-    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
+    targets_columns = list(df_targets_kpi.columns)
+    targets_columns.remove('customer_id')
+
+    df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
+
+    # We standardise the number of targets closely linked to the company's operations
+    df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
+
+    # Merge - purchasing behavior features
+    df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
+
+    # Fill NaN values
+    special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
+    simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
+
+    df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
 
     max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
     df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     print("Explanatory variable construction : SUCCESS")
 
     # 2. Construction of the explained variable
-    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
+    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
 
-    # Indicatrice d'achat
+    # Construction of the dependant variable
     df_products_purchased_to_predict['y_has_purchased'] = 1
 
     y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
@@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     return dataset
 
 ## Exportation
+# Sectors
 companies = {'musee' : ['1', '2', '3', '4'], # , '101'
             'sport': ['5', '6', '7', '8', '9'],
             'musique' : ['10', '11', '12', '13', '14']}
 
+# Choosed sector
 type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
 list_of_comp = companies[type_of_comp]
 
-# Dossier d'exportation
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
-
-# Create test dataset and train dataset for sport companies
+# Export folder
+BUCKET_OUT = f'projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}'
 
-#start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_features = 0.7)
+# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)
 
+# Dates used for the construction of features and the dependant variable
 start_date = "2021-05-01"
 end_of_features = "2022-11-01"
 final_date = "2023-11-01"
 
+# Anonymous customer to be deleted from the datasets
 anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
                       '5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
                       '10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
@@ -133,33 +146,23 @@ for company in list_of_comp:
     dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
                                    max_date = final_date, directory_path = company)
 
-    # On retire le client anonyme
+    # Deletion of the anonymous customer
     dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
 
-    # #train test set
-    # np.random.seed(42)
-    # split_ratio = 0.7
-    # split_index = int(len(dataset) * split_ratio)
-    # dataset = dataset.sample(frac=1).reset_index(drop=True)
-    # dataset_train = dataset.iloc[:split_index]
-    # dataset_test = dataset.iloc[split_index:]
-
+    # Split between train and test
     dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
 
     # Dataset Test
-    # Exportation
+    # Export
    FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
     FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
 
     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
         dataset_test.to_csv(file_out, index = False)
 
-    print("Exportation dataset test : SUCCESS")
+    print("Export of dataset test : SUCCESS")
 
     # Dataset train
 
     # Export
     FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
     FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
@@ -167,7 +170,7 @@ for company in list_of_comp:
     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
         dataset_train.to_csv(file_out, index = False)
 
-    print("Exportation dataset train : SUCCESS")
+    print("Export of dataset train : SUCCESS")
 
 
-print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
+print("End of dataset generation for ", type_of_comp," compagnies : SUCCESS")
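The substantive change in dataset_construction is the tightening of the time windows: features are now built on [min_date, end_features_date) and the label on [end_features_date, max_date), so a purchase on the boundary date can no longer contribute to both sides. A minimal sketch of that windowing logic with invented toy data (column and variable names follow the script):

import pandas as pd

# Toy purchase log (invented data).
purchases = pd.DataFrame({
    'customer_id': ['1_10', '1_11', '1_10', '1_12'],
    'purchase_date': pd.to_datetime(
        ['2022-01-15', '2022-11-01', '2023-03-02', '2023-10-30'], utc=True),
})

min_date = pd.to_datetime("2021-05-01", utc=True)
end_features_date = pd.to_datetime("2022-11-01", utc=True)
max_date = pd.to_datetime("2023-11-01", utc=True)

# Features window: [min_date, end_features_date), half-open as in the new version.
features = purchases[(purchases['purchase_date'] >= min_date)
                     & (purchases['purchase_date'] < end_features_date)]

# Target window: [end_features_date, max_date). The 2022-11-01 purchase now
# counts toward the label, not the features.
to_predict = purchases[(purchases['purchase_date'] >= end_features_date)
                       & (purchases['purchase_date'] < max_date)]
y = to_predict[['customer_id']].drop_duplicates().assign(y_has_purchased=1)
print(y)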
File diff suppressed because one or more lines are too long

@@ -14,14 +14,14 @@ fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 
 
 # Import KPI construction functions
-exec(open('0_KPI_functions.py').read())
+exec(open('utils_features_construction.py').read())
 
 # Ignore warning
 warnings.filterwarnings('ignore')
 
 # functions
 def generate_test_set(type_of_comp):
-    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
+    file_path_list = fs.ls(f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/Test_set")
     test_set = pd.DataFrame()
     for file in file_path_list:
         print(file)
@@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):
 
 
 def generate_train_set(type_of_comp):
-    file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
+    file_path_list = fs.ls(f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/Train_set")
     train_set = pd.DataFrame()
     for file in file_path_list:
         print(file)
@@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):
 
 
 type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
-BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/'
+BUCKET_OUT = f'projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/'
 
 # create test and train datasets
 test_set = generate_test_set(type_of_comp)
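generate_test_set and generate_train_set walk an S3 prefix and stack the per-company CSVs into a single DataFrame; only the path change is visible in this compare, the read loop bodies are cut off. A sketch of the pattern, assuming the same s3fs setup as the scripts; the concat_csv_under_prefix name is invented and the loop body is a plausible reconstruction, not the repository's exact code:

import os
import pandas as pd
import s3fs

S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

def concat_csv_under_prefix(prefix):
    """Read every CSV under an S3 prefix and concatenate into one DataFrame."""
    frames = []
    for path in fs.ls(prefix):
        with fs.open(path, mode="rb") as file_in:
            frames.append(pd.read_csv(file_in))
    return pd.concat(frames, ignore_index=True)

# e.g. the musee test sets produced by the previous script:
# test_set = concat_csv_under_prefix(
#     "projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/musee/Test_set")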
4_Descriptive_Statistics.py (new file, 82 lines)

@@ -0,0 +1,82 @@
+import pandas as pd
+import numpy as np
+import os
+import io
+import s3fs
+import re
+import warnings
+from datetime import date, timedelta, datetime
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+import seaborn as sns
+
+
+
+# Ignore warning
+warnings.filterwarnings('ignore')
+
+exec(open('utils_features_construction.py').read())
+exec(open('utils_stat_desc.py').read())
+
+# Create filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
+companies = {'musee' : ['1', '2', '3', '4'], # , '101'
+            'sport': ['5', '6', '7', '8', '9'],
+            'musique' : ['10', '11', '12', '13', '14']}
+
+
+# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
+for type_of_activity in ['musee', 'sport', 'musique'] :
+
+    list_of_comp = companies[type_of_activity]
+
+    # Load files
+    customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(list_of_comp)
+
+    # Identify anonymous customer for each company and remove them from our datasets
+    outlier_list = outlier_detection(tickets, list_of_comp)
+
+    # Identify valid customer (customer who bought tickets after starting date or received mails after starting date)
+    customer_valid_list = valid_customer_detection(products, campaigns_brut)
+
+    databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
+
+    for dataset in databases:
+        dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
+        dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
+        #print(f'shape of {dataset} : ', dataset.shape)
+
+    # Identify customer who bought during the period of y
+    customer_target_period = identify_purchase_during_target_periode(products)
+    customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
+
+    # Generate graph and automatically saved them in the bucket
+    compute_nb_clients(customer, type_of_activity)
+
+    #maximum_price_paid(customer, type_of_activity)
+
+    target_proportion(customer, type_of_activity)
+
+    mailing_consent(customer, type_of_activity)
+
+    mailing_consent_by_target(customer, type_of_activity)
+
+    gender_bar(customer, type_of_activity)
+
+    country_bar(customer, type_of_activity)
+
+    lazy_customer_plot(campaigns_kpi, type_of_activity)
+
+    campaigns_effectiveness(customer, type_of_activity)
+
+    sale_dynamics(products, campaigns_brut, type_of_activity)
+
+    tickets_internet(tickets, type_of_activity)
+
+    already_bought_online(tickets, type_of_activity)
+
+    box_plot_price_tickets(tickets, type_of_activity)
+
+    target_description(targets, type_of_activity)
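One subtlety in the new 4_Descriptive_Statistics.py: inside `for dataset in databases:`, the line `dataset = dataset[dataset['customer_id'].isin(customer_valid_list)]` rebinds the loop variable without touching the list, so the filtered frame is discarded at the next iteration (the in-place assignment via .apply on the line above does persist). A sketch of the difference, with invented toy frames:

import pandas as pd

valid = {'a', 'b'}
databases = [pd.DataFrame({'customer_id': ['a', 'b', 'z']})]

# Rebinding the loop variable does NOT change the list element:
for dataset in databases:
    dataset = dataset[dataset['customer_id'].isin(valid)]
print(len(databases[0]))  # still 3 rows

# Writing back by index does persist the filter:
for i, dataset in enumerate(databases):
    databases[i] = dataset[dataset['customer_id'].isin(valid)]
print(len(databases[0]))  # now 2 rows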
5_Modelling.py (new file, 87 lines)

@@ -0,0 +1,87 @@
+import pandas as pd
+import numpy as np
+import os
+import io
+import s3fs
+import re
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
+from sklearn.utils import class_weight
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.calibration import calibration_curve
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import GridSearchCV
+from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
+from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
+from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
+import pickle
+import warnings
+
+
+exec(open('utils_ml.py').read())
+
+warnings.filterwarnings('ignore')
+warnings.filterwarnings("ignore", category=ConvergenceWarning)
+warnings.filterwarnings("ignore", category=DataConversionWarning)
+
+# choose the type of companies for which you want to run the pipeline
+type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
+# choose the type of model
+type_of_model = input('Choisissez le type de model : standard ? premium ?')
+
+# load train and test set
+# Create filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
+dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
+
+X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
+
+print("Shape train : ", X_train.shape)
+print("Shape test : ", X_test.shape)
+
+# processing
+
+weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
+                                            y = y_train['y_has_purchased'])
+
+weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
+
+preproc = preprocess(type_of_model, type_of_activity)
+
+# Object for storing results
+model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
+
+# Naive Bayes
+model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
+print("Naive Bayes : Done")
+
+# Logistic Regression
+model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
+print("Logistic : Done")
+
+model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
+print("Logistic CV : Done")
+
+# Random Forest
+model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
+print("Random Forest : Done")
+
+model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
+print("Random Forest CV: Done")
+
+# Save result
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
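5_Modelling.py balances the heavily skewed purchase label with sklearn's 'balanced' class weights before fitting. A minimal sketch of what the weights/weight_dict lines compute; the labels here are invented, and the pipeline_* helpers live in utils_ml.py, which this compare does not show:

import numpy as np
from sklearn.utils import class_weight

# Toy label vector: 90% non-buyers, 10% buyers (invented for illustration).
y = np.array([0] * 90 + [1] * 10)

classes = np.unique(y)
weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y)
weight_dict = {classes[i]: weights[i] for i in range(len(classes))}

# 'balanced' gives n_samples / (n_classes * count(class)):
# {0: 100/(2*90) ~= 0.556, 1: 100/(2*10) = 5.0}
print(weight_dict)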
6_Segmentation_and_Marketing_Personae.py (new file, 86 lines)

@@ -0,0 +1,86 @@
+
+# Packages
+import pandas as pd
+import numpy as np
+import os
+import io
+import s3fs
+import re
+import pickle
+import warnings
+import matplotlib.pyplot as plt
+from tabulate import tabulate
+
+###################################
+
+# choose the model we use for the segmentation
+# model_name = "LogisticRegression_Benchmark"
+model_name = "LogisticRegression_cv"
+
+###################################
+
+
+# execute file including functions we need
+exec(open('utils_segmentation.py').read())
+
+warnings.filterwarnings('ignore')
+
+# Create filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
+
+# choose the type of companies for which you want to run the pipeline
+# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
+for type_of_activity in ['musee', 'sport', 'musique'] :
+
+    # load test set
+    dataset_test = load_test_file(type_of_activity)
+
+    # Load Model
+    model = load_model(type_of_activity, model_name)
+
+    ### Preprocessing of data
+    X_test = dataset_test.drop(columns = 'y_has_purchased')
+
+    y_test = dataset_test[['y_has_purchased']]
+
+    X_test_segment = X_test
+
+    # add y_has_purchased to X_test
+    X_test_segment["has_purchased"] = y_test
+
+    # Add prediction and probability to dataset_test
+    y_pred = model.predict(X_test)
+    X_test_segment["has_purchased_estim"] = y_pred
+
+    y_pred_prob = model.predict_proba(X_test)[:, 1]
+    X_test_segment['score'] = y_pred_prob
+
+    X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
+                                np.where(X_test_segment['score']<0.5, '2',
+                                np.where(X_test_segment['score']<0.75, '3', '4')))
+
+    ### 1. business KPIs
+
+    business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
+    X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)
+    print(f"business figures for {type_of_activity} companies :\n")
+    print(X_test_business_fig)
+    print("\n")
+
+    # save histogram to Minio
+    hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
+                               "nb_purchases", "total_amount", "nb_campaigns", type_of_activity)
+    save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)
+
+
+    ### 2. description of marketing personae
+    ## A. Spider chart
+    radar_mp_plot_all(df = X_test_segment, type_of_activity = type_of_activity)
+    save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)
+
+    ## B. Latex table
+    known_sociodemo_caracteristics(df = X_test_segment, type_of_activity = type_of_activity)
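The nested np.where in 6_Segmentation_and_Marketing_Personae.py bins the predicted purchase probability into four segments at 0.25/0.5/0.75. For reference, the same binning can be expressed with pd.cut; this is a sketch with invented scores, not a change proposed by the commits:

import numpy as np
import pandas as pd

scores = pd.Series([0.05, 0.30, 0.60, 0.99])  # invented predicted probabilities

# Nested np.where, as in the script:
seg_where = np.where(scores < 0.25, '1',
            np.where(scores < 0.5, '2',
            np.where(scores < 0.75, '3', '4')))

# Equivalent with pd.cut (right-open bins match the < comparisons):
seg_cut = pd.cut(scores, bins=[0, 0.25, 0.5, 0.75, np.inf],
                 labels=['1', '2', '3', '4'], right=False).astype(str)

assert (seg_where == seg_cut.to_numpy()).all()
print(seg_cut.tolist())  # ['1', '2', '3', '4']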
7_Sales_Forecast.py (new file, 112 lines)

@@ -0,0 +1,112 @@
+# importations
+import pandas as pd
+from pandas import DataFrame
+import numpy as np
+import os
+import s3fs
+import matplotlib.pyplot as plt
+from scipy.optimize import fsolve
+import pickle
+import warnings
+import io
+
+
+# ignore warnings
+warnings.filterwarnings('ignore')
+
+# Create filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
+# importation of functions defined
+exec(open('utils_sales_forecast.py').read())
+# from utils_CA_segment import *
+
+# define type of activity
+type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
+PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
+
+# type of model for the score
+type_of_model = "LogisticRegression_cv"
+# type_of_model = "LogisticRegression_Benchmark"
+
+# load train and test sets
+dataset_train, dataset_test = load_train_test(type_of_activity)
+
+# make features - define X train and X test
+X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
+
+# choose model - logit cross validated
+model = load_model(type_of_activity, type_of_model)
+
+# create table X test segment from X test
+X_test_segment = df_segment(X_test, y_test, model)
+
+# comparison with bias of the train set - X train to be defined
+X_train_score = model.predict_proba(X_train)[:, 1]
+
+bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),
+                           y_objective = y_train["y_has_purchased"].sum(),
+                           initial_guess=10)
+print("Bias estimated :", np.log(bias_train_set))
+
+# create a score adjusted with the bias computed
+score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
+X_test_segment["score_adjusted"] = score_adjusted_train
+
+print("The score was successfully adjusted")
+MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean()
+MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean()
+print(f"MAE for score : {MAE_score}")
+print(f"MAE for adjusted score : {MAE_ajusted_score}")
+
+### 1. plot adjusted scores and save (to be tested)
+plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
+save_file_s3_ca("hist_score_adjusted_", type_of_activity)
+
+
+### 2. comparison between score and adjusted score
+X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
+X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})
+
+print("Table of scores :\n")
+print(X_test_table_adjusted_scores)
+print("\n")
+
+# save table
+file_name = "table_adjusted_score_"
+FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
+with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+    X_test_table_adjusted_scores.to_csv(file_out, index = False)
+
+
+# project revenue
+X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",
+                                     duration_ref=17, duration_projection=12)
+
+
+### 3. table summarizing projections (nb tickets, revenue)
+"""
+X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
+                           nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
+                           total_amount="total_amount", pace_purchase="pace_purchase"),2)
+"""
+
+X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
+                           nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
+                           total_amount="total_amount_corrected", pace_purchase="pace_purchase"),2)
+
+# rename columns
+mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
+X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)
+
+print("Summary of forecast :\n")
+print(X_test_expected_CA)
+print("\n")
+
+# save table
+file_name = "table_expected_CA_"
+FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
+with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
+    X_test_expected_CA.to_csv(file_out, index = False)
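The bias correction in 7_Sales_Forecast.py rescales predicted probabilities in odds space so that the expected number of buyers matches an observed count. The helpers live in utils_sales_forecast.py, which this compare does not show; the implementations below are assumptions consistent with how they are called (odd_ratio, adjusted_score, find_bias with fsolve and an initial_guess; adjust_score_1 presumably nudges scores of exactly 1 before the odds transform and is omitted here):

import numpy as np
from scipy.optimize import fsolve

def odd_ratio(p):
    # Probability -> odds. (Assumed implementation.)
    return p / (1 - p)

def adjusted_score(odds, bias):
    # Scale odds by a constant bias, map back to probability. (Assumed.)
    return bias * odds / (1 + bias * odds)

def find_bias(odd_ratios, y_objective, initial_guess=10):
    # Solve for the bias making expected positives equal the target count. (Assumed.)
    return fsolve(lambda b: adjusted_score(odd_ratios, b).sum() - y_objective,
                  x0=initial_guess)[0]

rng = np.random.default_rng(0)
scores = rng.uniform(0.01, 0.6, size=1000)   # invented raw model scores
n_buyers = 120                               # invented observed positive count

bias = find_bias(odd_ratio(scores), n_buyers)
adjusted = adjusted_score(odd_ratio(scores), bias)
print(round(adjusted.sum()))                 # ~120: calibrated to the target count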
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Notebook_AR.ipynb (8490 lines)
File diff suppressed because one or more lines are too long
@@ -1,825 +0,0 @@
(Deleted notebook; the removed JSON source is truncated in this view. The recoverable cells were exploratory: a "Business Data Challenge - Team 1" title cell; S3 filesystem setup and a listing of bdc2324-data/1 through bdc2324-data/14 plus bdc2324-data/101, with a note that the folders only split the files by size, not by period; loading 1campaign_stats.csv, 2campaign_stats.csv and 3campaign_stats.csv and converting their sent_at columns to UTC datetimes, then printing min/max sent_at per file; loading 1customersplus.csv and 2customersplus.csv (with a DtypeWarning on mixed-type column 20), inspecting columns, unique ids, and ids shared between the two companies; a per-column missing-value percentage table for customers_plus_1 (civility, extra, deleted_at, reference, extra_field, preferred_category/supplier/formula and last_visiting_date 100% missing; birthdate and age about 96%; zipcode about 71%; several purchase-history fields about 52%); and a bulk load of customer_target_mappings, customersplus, target_types, tags, events, tickets, representations, purchases and products for company 11, ending with a preview of merged purchase rows that is cut off mid-table.)
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318965</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318966</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318967</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1244277</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>5.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
|
||||||
" <td>a boire et a manger</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318968</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1244277</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>5.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
|
||||||
" <td>a boire et a manger</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"<p>318969 rows × 52 columns</p>\n",
|
|
||||||
"</div>"
],
"text/plain": [
"        id        lastname       firstname   birthdate       email  \\\n",
"0       405082  lastname405082             NaN         NaN         NaN \n",
"1       405082  lastname405082             NaN         NaN         NaN \n",
"2       411168  lastname411168             NaN         NaN         NaN \n",
"3       411168  lastname411168             NaN         NaN         NaN \n",
"4         4380    lastname4380   firstname4380         NaN         NaN \n",
"...        ...             ...             ...         ...         ... \n",
"318964   19095   lastname19095  firstname19095  1979-07-16  email19095 \n",
"318965   19095   lastname19095  firstname19095  1979-07-16  email19095 \n",
"318966   19095   lastname19095  firstname19095  1979-07-16  email19095 \n",
"318967   19095   lastname19095  firstname19095  1979-07-16  email19095 \n",
"318968   19095   lastname19095  firstname19095  1979-07-16  email19095 \n",
"\n",
"        street_id                        created_at  \\\n",
"0               6  2023-01-12 06:30:31.197484+01:00 \n",
"1               6  2023-01-12 06:30:31.197484+01:00 \n",
"2               6  2023-03-17 06:30:35.431967+01:00 \n",
"3               6  2023-03-17 06:30:35.431967+01:00 \n",
"4               1  2021-04-22 14:51:55.432952+02:00 \n",
"...           ...                               ... \n",
"318964          6  2021-04-22 15:06:30.120537+02:00 \n",
"318965          6  2021-04-22 15:06:30.120537+02:00 \n",
"318966          6  2021-04-22 15:06:30.120537+02:00 \n",
"318967          6  2021-04-22 15:06:30.120537+02:00 \n",
"318968          6  2021-04-22 15:06:30.120537+02:00 \n",
"\n",
"                              updated_at civility is_partner  ...  \\\n",
"0       2023-01-12 06:30:31.197484+01:00      NaN      False  ... \n",
"1       2023-01-12 06:30:31.197484+01:00      NaN      False  ... \n",
"2       2023-03-17 06:30:35.431967+01:00      NaN      False  ... \n",
"3       2023-03-17 06:30:35.431967+01:00      NaN      False  ... \n",
"4       2022-04-14 11:41:33.738500+02:00      NaN      False  ... \n",
"...                                  ...      ...        ...  ... \n",
"318964  2023-09-12 18:27:36.904104+02:00      NaN      False  ... \n",
"318965  2023-09-12 18:27:36.904104+02:00      NaN      False  ... \n",
"318966  2023-09-12 18:27:36.904104+02:00      NaN      False  ... \n",
"318967  2023-09-12 18:27:36.904104+02:00      NaN      False  ... \n",
"318968  2023-09-12 18:27:36.904104+02:00      NaN      False  ... \n",
"\n",
"        tenant_id     id_x customer_id              purchase_date type_of  \\\n",
"0            1556   992423      405082  2023-01-11 17:08:41+01:00       3 \n",
"1            1556   992423      405082  2023-01-11 17:08:41+01:00       3 \n",
"2            1556  1053934      411168  2023-03-16 16:23:10+01:00       3 \n",
"3            1556  1053934      411168  2023-03-16 16:23:10+01:00       3 \n",
"4            1556  1189141        4380  2020-11-26 13:12:53+01:00       3 \n",
"...           ...      ...         ...                        ...     ... \n",
"318964       1556  1090839       19095  2019-05-19 21:18:36+02:00       1 \n",
"318965       1556  1090839       19095  2019-05-19 21:18:36+02:00       1 \n",
"318966       1556  1090839       19095  2019-05-19 21:18:36+02:00       1 \n",
"318967       1556  1244277       19095  2019-12-31 11:04:07+01:00       1 \n",
"318968       1556  1244277       19095  2019-12-31 11:04:07+01:00       1 \n",
"\n",
"       is_from_subscription  amount is_full_price            start_date_time  \\\n",
"0                     False    13.0         False  2023-02-06 20:00:00+01:00 \n",
"1                     False    13.0         False  2023-02-06 20:00:00+01:00 \n",
"2                     False    62.0         False  2023-03-19 16:00:00+01:00 \n",
"3                     False    62.0         False  2023-03-19 16:00:00+01:00 \n",
"4                     False    51.3         False  2020-12-01 20:00:00+01:00 \n",
"...                     ...     ...           ...                        ... \n",
"318964                False     4.5         False  2019-05-27 20:00:00+02:00 \n",
"318965                False     4.5         False  2019-05-27 20:00:00+02:00 \n",
"318966                False     4.5         False  2019-05-27 20:00:00+02:00 \n",
"318967                False     5.5         False  2020-02-03 20:00:00+01:00 \n",
"318968                False     5.5         False  2020-02-03 20:00:00+01:00 \n",
"\n",
"                  event_name \n",
"0                      zaide \n",
"1                      zaide \n",
"2               luisa miller \n",
"3               luisa miller \n",
"4       iphigenie en tauride \n",
"...                      ... \n",
"318964          entre femmes \n",
"318965          entre femmes \n",
"318966          entre femmes \n",
"318967   a boire et a manger \n",
"318968   a boire et a manger \n",
"\n",
"[318969 rows x 52 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Join\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
69 README.md Normal file
@ -0,0 +1,69 @@
# Business data challenge 2023-2024 | ENSAE Paris

# Arenametrix : customer segmentation

<p align="center">
<img src="https://dev.arenametrix.fr/assets/logo_ax-806e8204f49bcc2c5e8cd34e9748d16a6038404e37fdb2dc9d61455bb06c6461.png" width=300>
</p>

## Team 1

* Antoine JOUBREL
* Alexis REVELLE
* Fanta RODRIGUE
* Thomas PIQUÉ

## Coaches

* Elia LAPENTA
* Michael VISSER

## Support team

* Patrice MICHEL (Datastorm)
* Hassan MAISSORO (Datastorm)
* Alexandre PRINC (Arenametrix)

## Microeconomics coordinator

* Yuanzhe TANG

### Description of the problem

The goal of this project is to create segments of customers from 15 companies belonging to 3 different types of activities (sports companies, museums, and music companies).

### More detailed instructions provided by Arenametrix

- Definition of “marketing personae” that can be matched with a probability to buy a future event
- Matching between future events and people in the database (with, for instance, a probability to buy a future event)
- And thus, a forecast of the quantity of tickets sold per event, by “marketing personae” or by segment of the database
- BONUS : What is the best timing to send a communication to each contact in the database and each “marketing personae”?
- BONUS : What should we tell each contact in the database and each “marketing personae” to make them come back?

### Our approach

We opted for a sector-based approach, which means that 3 segmentations have been performed (one for each type of activity).
As the segments have to be linked to a probability of future purchase, we directly used the probability of purchase during the upcoming year to build the segments. The first step of the modelling is a pipeline that fits 3 ML models (naive Bayes, random forest, and logistic regression) on the data to predict whether the customer will purchase during the year. We then use the estimated probability of purchase to split the customers into 4 segments. For each segment, we can estimate the potential number of tickets and revenue for the upcoming year (a minimal sketch of this segmentation step is shown below).
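For illustration, here is a minimal sketch of that segmentation step, assuming a fitted scikit-learn pipeline exposing `predict_proba`; the function name and threshold values below are illustrative, not the ones used in the repository:

```python
import pandas as pd

def assign_segments(scores: pd.Series, thresholds=(0.25, 0.5, 0.75)) -> pd.Series:
    """Map propensity scores in [0, 1] to 4 ordered segments."""
    edges = [0.0, *thresholds, 1.0]
    return pd.cut(scores, bins=edges, labels=['1', '2', '3', '4'], include_lowest=True)

# scores would come from the fitted pipeline, e.g. pipeline.predict_proba(X_test)[:, 1]
scores = pd.Series([0.05, 0.40, 0.90])
print(assign_segments(scores))  # -> segments 1, 2 and 4
```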
### How to run the code

The scripts have to be run in the order given by their numbers. Each of them is described below:

- `1_Input_cleaning.py` \
Cleans the raw data and generates dataframes that will be used to build datasets with insightful variables. Datasets are exported to location 0_Input/.
- `2_Datasets_generation.py` \
Uses the previously created dataframes and aggregates them to create a test set and a train set for each company. Databases are exported to location 1_Temp/1_0_Modelling_Datasets/, in a folder containing all 5 databases for a type of activity.
- `3_Modelling_datasets.py` \
For each type of activity, the test and train sets of the 5 tenants are concatenated. Databases are exported to location 1_Temp/1_0_Modelling_Datasets/.
- `4_Descriptive_statistics.py` \
Generates graphics providing descriptive statistics about the data at the activity level. All graphics are exported to location 2_Output/2_0_Descriptive_Statistics/.
- `5_Modelling.py` \
3 ML models are fitted on the data, and results are exported for all 3 types of activities. \
3 pipelines are built, one per type of model (naive Bayes, random forest, logistic regression). For the latter two methods, cross-validation is performed to ensure generalization. Graphics displaying the quality of the training are provided. The optimal parameters found are saved in a pickle file (used in the 6th step to add propensity scores to the test set and then determine the customer segments). All these files are exported to location 2_Output/2_1_Modeling_results/.
- `6_Segmentation_and_Marketing_Personae.py` \
The test set is scored with the optimal parameters computed previously, and a propensity score (probability of a future purchase) is assigned to each customer of this dataset. Segmentation is performed according to the scores obtained. Graphics describing the marketing personae associated with the segments, as well as their business value, are exported to location 2_Output/2_2_Segmentation_and_Marketing_Personae/.
- `7_Sales_Forecast.py` \
To ensure a decent recall, and because of the imbalance of the target variable y (the global probability of purchase is between 4 and 14 %), the probabilities of purchasing are overestimated. The scores are therefore adjusted so that their mean approximates the overall probability of a purchase (a sketch of one such rescaling is shown below). The adjusted score is used to estimate, for each customer, the number of tickets sold and the revenue generated during the upcoming year. Results are aggregated at the segment level. A histogram displaying the adjusted propensity scores and 2 tables summarizing the forecast outcome are exported to location 2_Output/2_3_Sales_Forecast/.
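The score adjustment in `7_Sales_Forecast.py` can be pictured with the following minimal sketch; the multiplicative rescaling shown here is only one plausible way to match the mean of the scores to the observed purchase rate, and the names are illustrative:

```python
import numpy as np

def adjust_scores(scores, observed_purchase_rate):
    """Rescale propensity scores so that their mean matches the observed purchase rate."""
    scores = np.asarray(scores, dtype=float)
    adjusted = scores * (observed_purchase_rate / scores.mean())
    return np.clip(adjusted, 0.0, 1.0)  # keep values interpretable as probabilities

raw_scores = np.array([0.20, 0.45, 0.80])  # overestimated propensities
print(adjust_scores(raw_scores, 0.10))     # adjusted scores whose mean is ~0.10
```

The per-customer ticket and revenue estimates can then be derived from the adjusted score (for instance by weighting each customer's historical averages by it, though the exact formula used in the script may differ) and summed within each segment.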
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
249 all_packages_versions.txt Normal file
@ -0,0 +1,249 @@
Package                   Version
------------------------- ---------------
aiohttp                   3.9.1
aiosignal                 1.3.1
alembic                   1.13.1
anyio                     4.2.0
archspec                  0.2.2
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
astroid                   3.0.2
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.14.0
bcrypt                    4.1.2
beautifulsoup4            4.12.3
bleach                    6.1.0
blinker                   1.7.0
bokeh                     3.3.4
boltons                   23.1.1
boto3                     1.34.29
botocore                  1.34.29
branca                    0.7.0
Brotli                    1.1.0
cached-property           1.5.2
cachetools                5.3.2
certifi                   2023.11.17
cffi                      1.16.0
charset-normalizer        3.3.2
click                     8.1.7
click-plugins             1.1.1
cligj                     0.7.2
cloudpickle               3.0.0
colorama                  0.4.6
comm                      0.2.1
conda                     23.11.0
conda-libmamba-solver     23.12.0
conda-package-handling    2.2.0
conda_package_streaming   0.9.0
configparser              5.3.0
contourpy                 1.2.0
cryptography              41.0.7
cycler                    0.12.1
cytoolz                   0.12.2
dask                      2024.1.1
databricks-cli            0.18.0
debugpy                   1.8.0
decorator                 5.1.1
defusedxml                0.7.1
dill                      0.3.8
distributed               2024.1.1
distro                    1.8.0
docker                    7.0.0
duckdb                    0.9.2
entrypoints               0.4
exceptiongroup            1.2.0
executing                 2.0.1
fastjsonschema            2.19.1
fiona                     1.9.5
flake8                    7.0.0
Flask                     3.0.1
folium                    0.15.1
fonttools                 4.47.2
fqdn                      1.5.1
frozenlist                1.4.1
fsspec                    2023.12.2
GDAL                      3.8.3
gensim                    4.3.2
geopandas                 0.14.2
gitdb                     4.0.11
GitPython                 3.1.41
google-auth               2.27.0
greenlet                  3.0.3
gunicorn                  21.2.0
hvac                      2.1.0
idna                      3.6
importlib-metadata        7.0.1
importlib-resources       6.1.1
ipykernel                 6.29.0
ipython                   8.20.0
ipywidgets                8.1.1
isoduration               20.11.0
isort                     5.13.2
itsdangerous              2.1.2
jedi                      0.19.1
Jinja2                    3.1.3
jmespath                  1.0.1
joblib                    1.3.2
json5                     0.9.14
jsonpatch                 1.33
jsonpointer               2.4
jsonschema                4.21.1
jsonschema-specifications 2023.12.1
jupyter-cache             1.0.0
jupyter_client            8.6.0
jupyter_core              5.7.1
jupyter-events            0.9.0
jupyter-lsp               2.2.2
jupyter_server            2.12.5
jupyter-server-mathjax    0.2.6
jupyter_server_terminals  0.5.2
jupyterlab                4.0.11
jupyterlab_git            0.50.0
jupyterlab_pygments       0.3.0
jupyterlab_server         2.25.2
jupyterlab-widgets        3.0.9
kiwisolver                1.4.5
kubernetes                29.0.0
libmambapy                1.5.5
llvmlite                  0.41.1
locket                    1.0.0
lz4                       4.3.3
Mako                      1.3.1
mamba                     1.5.5
mapclassify               2.6.1
Markdown                  3.5.2
MarkupSafe                2.1.4
matplotlib                3.8.2
matplotlib-inline         0.1.6
mccabe                    0.7.0
menuinst                  2.0.2
mistune                   3.0.2
mlflow                    2.10.0
msgpack                   1.0.7
multidict                 6.0.4
munkres                   1.1.4
mypy                      1.8.0
mypy-extensions           1.0.0
nbclient                  0.8.0
nbconvert                 7.14.2
nbdime                    4.0.1
nbformat                  5.9.2
nest_asyncio              1.6.0
networkx                  3.2.1
nltk                      3.8.1
notebook_shim             0.2.3
numba                     0.58.1
numpy                     1.26.3
oauthlib                  3.2.2
opencv-python-headless    4.9.0.80
overrides                 7.7.0
packaging                 23.2
pandas                    2.2.0
pandocfilters             1.5.0
paramiko                  3.4.0
parso                     0.8.3
partd                     1.4.1
patsy                     0.5.6
pexpect                   4.9.0
pickleshare               0.7.5
pillow                    10.2.0
pip                       23.3.2
pkgutil_resolve_name      1.3.10
platformdirs              4.1.0
plotly                    5.18.0
pluggy                    1.3.0
polars                    0.20.6
prometheus-client         0.19.0
prometheus-flask-exporter 0.23.0
prompt-toolkit            3.0.42
protobuf                  4.24.4
psutil                    5.9.8
ptyprocess                0.7.0
pure-eval                 0.2.2
pyarrow                   14.0.2
pyarrow-hotfix            0.6
pyasn1                    0.5.1
pyasn1-modules            0.3.0
pycodestyle               2.11.1
pycosat                   0.6.6
pycparser                 2.21
pyflakes                  3.2.0
Pygments                  2.17.2
PyJWT                     2.8.0
pylint                    3.0.3
PyNaCl                    1.5.0
pyOpenSSL                 23.3.0
pyparsing                 3.1.1
pyproj                    3.6.1
PySocks                   1.7.1
python-dateutil           2.8.2
python-json-logger        2.0.7
pytz                      2023.3.post1
pyu2f                     0.1.5
PyYAML                    6.0.1
pyzmq                     25.1.2
querystring-parser        1.2.4
referencing               0.32.1
regex                     2023.12.25
requests                  2.31.0
requests-oauthlib         1.3.1
rfc3339-validator         0.1.4
rfc3986-validator         0.1.1
rpds-py                   0.17.1
rsa                       4.9
Rtree                     1.2.0
ruamel.yaml               0.18.5
ruamel.yaml.clib          0.2.7
s3fs                      0.4.2
s3transfer                0.10.0
scikit-learn              1.4.0
scipy                     1.12.0
seaborn                   0.13.2
Send2Trash                1.8.2
setuptools                68.2.2
shapely                   2.0.2
six                       1.16.0
smart-open                6.4.0
smmap                     5.0.0
sniffio                   1.3.0
sortedcontainers          2.4.0
soupsieve                 2.5
SQLAlchemy                2.0.25
sqlparse                  0.4.4
stack-data                0.6.2
statsmodels               0.14.1
tabulate                  0.9.0
tblib                     3.0.0
tenacity                  8.2.3
terminado                 0.18.0
threadpoolctl             3.2.0
tinycss2                  1.2.1
tomli                     2.0.1
tomlkit                   0.12.3
toolz                     0.12.1
tornado                   6.3.3
tqdm                      4.66.1
traitlets                 5.14.1
truststore                0.8.0
types-python-dateutil     2.8.19.20240106
typing_extensions         4.9.0
typing-utils              0.1.0
tzdata                    2023.4
uri-template              1.3.0
urllib3                   1.26.18
wcwidth                   0.2.13
webcolors                 1.13
webencodings              0.5.1
websocket-client          1.7.0
Werkzeug                  3.0.1
wheel                     0.42.0
widgetsnbextension        4.0.9
xgboost                   2.0.3
xyzservices               2023.10.1
yarl                      1.9.4
zict                      3.0.0
zipp                      3.17.0
zstandard                 0.22.0
@ -1,460 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bf34b03c-536f-4f93-93a5-e452552653aa",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"FIN DE LA GENERATION DES DATASETS : SUCCESS\n"
]
}
],
"source": [
"# Business Data Challenge - Team 1\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings\n",
"from datetime import date, timedelta, datetime\n",
"\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"\n",
"# Import KPI construction functions\n",
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"\n",
"def display_covering_time(df, company, datecover):\n",
"    \"\"\"\n",
"    This function draws the time coverage of each company\n",
"    \"\"\"\n",
"    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
"    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
"    datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
"    print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
"    return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
"    \"\"\"\n",
"    This function returns the time coverage for all companies\n",
"    \"\"\"\n",
"    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
"    intersection = set.intersection(*timestamps_sets)\n",
"    intersection_list = list(intersection)\n",
"    formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
"    return sorted(formated_dates)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_train = 0.7):\n",
"    \"\"\"\n",
"    This function returns start_date, end_of_features and final dates\n",
"    that help to construct train and test datasets\n",
"    \"\"\"\n",
"    datecover = {}\n",
"    for company in sport:\n",
"        df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
"                                                          datetime_col = ['purchase_date'])\n",
"        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
"    #print(datecover.keys())\n",
"    dt_coverage = compute_time_intersection(datecover)\n",
"    start_date = dt_coverage[0]\n",
"    end_of_features = dt_coverage[int(coverage_train * len(dt_coverage))]\n",
"    final_date = dt_coverage[-1]\n",
"    return start_date, end_of_features, final_date\n",
"\n",
"\n",
"def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
"\n",
"    # Import customerplus\n",
"    df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
"    df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
"    df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
"\n",
"    # Consistency filter for applying our method\n",
"    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')\n",
"    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
"    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
"\n",
"    # Filter the df_campaigns_information table\n",
"    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n",
"    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
"\n",
"    # Filter the df_products_purchased_reduced table\n",
"    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n",
"\n",
"    print(\"Data filtering : SUCCESS\")\n",
"\n",
"    # Merge everything and build the KPIs\n",
"\n",
"    # KPIs on advertising campaigns\n",
"    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)\n",
"\n",
"    # KPIs on purchasing behaviour\n",
"    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n",
"\n",
"    # KPIs on socio-demographic data\n",
"    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
"\n",
"    print(\"KPIs construction : SUCCESS\")\n",
"\n",
"    # Merge with customer-related KPIs\n",
"    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
"\n",
"    # Fill NaN values\n",
"    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n",
"\n",
"    # Merge with purchase-behaviour KPIs\n",
"    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
"\n",
"    # Fill NaN values\n",
"    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n",
"\n",
"    print(\"Explanatory variable construction : SUCCESS\")\n",
"\n",
"    # 2. Construction of the explained variable\n",
"    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n",
"\n",
"    # Purchase indicator\n",
"    df_products_purchased_to_predict['y_has_purchased'] = 1\n",
"\n",
"    y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()\n",
"\n",
"    print(\"Explained variable construction : SUCCESS\")\n",
"\n",
"    # 3. Merge between explained and explanatory variables\n",
"    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
"\n",
"    # 0 if there is no purchase\n",
"    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
"\n",
"    # add id_company prefix to customer_id\n",
"    dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')\n",
"\n",
"    return dataset\n",
"\n",
"## Exportation\n",
"\n",
"companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
"             'sport': ['5', '6', '7', '8', '9'],\n",
"             'musique' : ['10', '11', '12', '13', '14']}\n",
"\n",
"type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n",
"list_of_comp = companies[type_of_comp]\n",
"# Export folder\n",
"BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'\n",
"\n",
"# Create test dataset and train dataset for the selected companies\n",
"\n",
"start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
"\n",
"for company in list_of_comp:\n",
"    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
"                                        max_date = final_date, directory_path = company)\n",
"\n",
"    # Exportation\n",
"    FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n",
"    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Test_set/\" + FILE_KEY_OUT_S3\n",
"\n",
"    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
"        dataset_test.to_csv(file_out, index = False)\n",
"\n",
"    print(\"Exportation dataset test : SUCCESS\")\n",
"\n",
"    # Dataset train\n",
"    dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
"                                         max_date = final_date, directory_path = company)\n",
"    # Export\n",
"    FILE_KEY_OUT_S3 = \"dataset_train\" + company + \".csv\"\n",
"    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Train_test/\" + FILE_KEY_OUT_S3\n",
"\n",
"    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
"        dataset_train.to_csv(file_out, index = False)\n",
"\n",
"    print(\"Exportation dataset train : SUCCESS\")\n",
"\n",
"\n",
"print(\"FIN DE LA GENERATION DES DATASETS : SUCCESS\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3721427e-5957-4556-b278-2e7ffca892f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"FILE_PATH_OUT_S3"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f8546992-f425-4d1e-ad75-ad26a8052a18",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'projet' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprojet\u001b[49m\u001b[38;5;241m-\u001b[39mbdc2324\u001b[38;5;241m-\u001b[39mteam1\u001b[38;5;241m/\u001b[39mGeneralization\u001b[38;5;241m/\u001b[39mmusique\u001b[38;5;241m/\u001b[39mTrain_test\n",
"\u001b[0;31mNameError\u001b[0m: name 'projet' is not defined"
]
}
],
"source": [
"projet-bdc2324-team1/Generalization/musique/Train_test"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0dd34710-6da2-4438-9e1d-0ac092c1d28c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(343126, 41)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id               0.000000\n",
"nb_tickets                0.000000\n",
"nb_purchases              0.000000\n",
"total_amount              0.000000\n",
"nb_suppliers              0.000000\n",
"vente_internet_max        0.000000\n",
"purchase_date_min         0.858950\n",
"purchase_date_max         0.858950\n",
"time_between_purchase     0.858950\n",
"nb_tickets_internet       0.000000\n",
"street_id                 0.000000\n",
"structure_id              0.869838\n",
"mcp_contact_id            0.276677\n",
"fidelity                  0.000000\n",
"tenant_id                 0.000000\n",
"is_partner                0.000000\n",
"deleted_at                1.000000\n",
"gender                    0.000000\n",
"is_email_true             0.000000\n",
"opt_in                    0.000000\n",
"last_buying_date          0.709626\n",
"max_price                 0.709626\n",
"ticket_sum                0.000000\n",
"average_price             0.709626\n",
"average_purchase_delay    0.709731\n",
"average_price_basket      0.709731\n",
"average_ticket_basket     0.709731\n",
"total_price               0.000000\n",
"purchase_count            0.000000\n",
"first_buying_date         0.709626\n",
"country                   0.152090\n",
"gender_label              0.000000\n",
"gender_female             0.000000\n",
"gender_male               0.000000\n",
"gender_other              0.000000\n",
"country_fr                0.152090\n",
"has_tags                  0.000000\n",
"nb_campaigns              0.000000\n",
"nb_campaigns_opened       0.000000\n",
"time_to_open              0.848079\n",
"y_has_purchased           1.000000\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train.isna().sum()/dataset_train.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "75f9a672-641f-49a2-a8d6-7673845506f5",
"metadata": {},
"outputs": [],
"source": [
"# Creation of the dummy dependent variable: 1 if the individual made a purchase during the train period, 0 otherwise\n",
"\n",
"dataset_train_modif=dataset_train\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c121c1e2-d8e4-4b93-a882-9385581b63c9",
"metadata": {},
"outputs": [],
"source": [
"dataset_train_modif[\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
     cleaning_date(customerplus_copy, 'last_visiting_date')
 
     # Variable selection
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id' 'last_visiting_date', 'deleted_at'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
     customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
 
     return customerplus_copy
@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
     df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
     return df
 
-def campaigns_kpi_function(campaigns_information = None, max_date = None):
+def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
 
     # Number of email campaigns
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
 
     # Average time to open (in hours)
-    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
     campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
 
     time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):
 
     return campaigns_reduced
 
-
 def tickets_kpi_function(tickets_information = None):
 
     tickets_information_copy = tickets_information.copy()
@ -82,6 +81,11 @@ def tickets_kpi_function(tickets_information = None):
     tickets_kpi['nb_purchases_internet'] = tickets_kpi['nb_purchases_internet'].fillna(0)
     tickets_kpi['prop_purchases_internet'] = tickets_kpi['nb_purchases_internet'] / tickets_kpi['nb_purchases']
 
+    # Number of purchases per month
+    tickets_information_copy['month_year_purchase'] = 'purchases_' + tickets_information_copy['purchase_date'].dt.month.astype(str) + '_' + tickets_information_copy['purchase_date'].dt.year.astype(str)
+    purchases_by_month = tickets_information_copy.pivot_table(index='customer_id', columns='month_year_purchase', values='purchase_id', aggfunc='nunique', fill_value=0)
+    tickets_kpi = pd.merge(tickets_kpi, purchases_by_month, on = 'customer_id', how = 'left')
+
     return tickets_kpi
 
 def customerplus_kpi_function(customerplus_clean = None):
@ -93,16 +97,69 @@
         1: 'male',
         2: 'other'
     })
 
     gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
     customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
+    customerplus_clean.drop(columns = "gender", inplace = True)
 
-    customerplus_clean['opt_in'] = np.multiply(customersplus['opt_in'], 1)
-    ## Dummy indicating whether the individual lives in France
+    # Age
+    customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
+    customerplus_clean['categorie_age_10_20'] = ((customerplus_clean['age'] >= 10) & (customerplus_clean['age'] < 20)).astype(int)
+    customerplus_clean['categorie_age_20_30'] = ((customerplus_clean['age'] >= 20) & (customerplus_clean['age'] < 30)).astype(int)
+    customerplus_clean['categorie_age_30_40'] = ((customerplus_clean['age'] >= 30) & (customerplus_clean['age'] < 40)).astype(int)
+    customerplus_clean['categorie_age_40_50'] = ((customerplus_clean['age'] >= 40) & (customerplus_clean['age'] < 50)).astype(int)
+    customerplus_clean['categorie_age_50_60'] = ((customerplus_clean['age'] >= 50) & (customerplus_clean['age'] < 60)).astype(int)
+    customerplus_clean['categorie_age_60_70'] = ((customerplus_clean['age'] >= 60) & (customerplus_clean['age'] < 70)).astype(int)
+    customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
+    customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
+    customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+    # customerplus_clean.drop(columns = "age", inplace = True)
+
+    # Mailing consent
+    customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
+
+    # Dummy indicating whether the individual lives in France
     customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
+    # customerplus_clean.drop(columns = "country", inplace = True)
+
+    customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
+    # customerplus_clean.drop(columns = "profession", inplace = True)
+
-    # Dummy if the customer has a structure id (tags)
+    customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
-    # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
+    # customerplus_clean.drop(columns = "zipcode", inplace = True)
 
     return customerplus_clean
+
+def targets_KPI(df_target = None):
+
+    df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
+
+    # Target name categories for museums
+    df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
+    df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
+    df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
+    df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
+    df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
+    df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
+    df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
+
+    # Target name categories for sport companies
+    df_target['target_abonne'] = ((
+        df_target['target_name']
+        .str.contains('|'.join(['abo', 'adh']), case=False)
+        & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
+    ).astype(int))
+
+    df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
+
+    target_agg = df_target.groupby('customer_id').agg(
+        nb_targets=('target_name', 'nunique')  # tuples specify the output column names
+        # all_targets=('target_name', concatenate_names),
+        # all_target_types=('target_type_name', concatenate_names)
+    ).reset_index()
+
+    target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
+
+    return target_agg
|
||||||
|
|
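For reference, a quick illustration of what targets_KPI returns on a toy target table (the data is made up; the column names are the ones the function expects):

import pandas as pd

toy_targets = pd.DataFrame({
    'customer_id': [1, 1, 2],
    'target_name': ['Newsletter NL', 'Jeune 12-25 ans', None],
})
out = targets_KPI(df_target=toy_targets)
print(out[['customer_id', 'nb_targets', 'target_jeune', 'target_newsletter']])
#    customer_id  nb_targets  target_jeune  target_newsletter
# 0            1           2             1                  1
# 1            2           1             0                  0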
425 utils_ml.py Normal file
@@ -0,0 +1,425 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import pickle
import warnings

# NB: the helpers in this module read several names from module scope
# (fs, type_of_activity, type_of_model, y_test, dataset_test, preproc, weight_dict),
# which the calling script is expected to define.


def load_train_test(type_of_activity, type_of_model):
    BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    if type_of_model == 'premium':
        dataset_train['company'] = dataset_train['customer_id'].apply(lambda x: x.split('_')[0])
        dataset_test['company'] = dataset_test['customer_id'].apply(lambda x: x.split('_')[0])
        dataset_train = dataset_train[dataset_train['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
        dataset_test = dataset_test[dataset_test['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
    return dataset_train, dataset_test


def save_file_s3(File_name, type_of_activity, type_of_model, model):
    """
    Save the current matplotlib figure to S3 storage.
    """
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
    """
    Save a result DataFrame to S3 storage as CSV.
    """
    if model_path:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
    else:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index = False)


def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
    """
    Save a fitted model to S3 storage as a pickle file.
    """
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)


def compute_recall(group):
    return recall_score(group['y_has_purchased'], group['prediction'])


def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)


def features_target_split(dataset_train, dataset_test):
    """
    Split the train and test datasets into features (X) and target (y).
    """
    features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
                  'purchases_10_2021', 'purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021', 'purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
                  'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
                  'purchases_8_2022', 'purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
                  'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30', 'categorie_age_30_40',
                  'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80', 'categorie_age_inconnue',
                  'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
                  'target_jeune', 'target_abonne']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test


def preprocess(type_of_model, type_of_activity):
    """
    Preprocess variables before running the machine learning pipeline.
    """

    numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
                        'purchases_10_2021', 'purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021', 'purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
                        'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
                        'purchases_8_2022', 'purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']

    binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30', 'categorie_age_30_40',
                       'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80', 'categorie_age_inconnue',
                       'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']

    if type_of_activity == 'musee':
        numeric_features.remove('time_to_open')

    if type_of_model == 'premium':
        if type_of_activity == 'musique':
            binary_features.extend(['target_optin', 'target_newsletter'])
        elif type_of_activity == 'sport':
            binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
        else:
            binary_features.extend(['target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler())
    ])

    binary_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])
    preproc = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("bin", binary_transformer, binary_features)
        ]
    )
    return preproc
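As a sanity check, the ColumnTransformer returned by preprocess can be fitted on its own to inspect the transformed feature names. A minimal sketch with the same structure, reduced to one numeric and one binary column (the toy frame stands in for the real feature matrix):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

toy_preproc = ColumnTransformer(transformers=[
    ("num", Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value=0)),
                      ("scaler", StandardScaler())]), ["nb_tickets"]),
    ("bin", Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))]), ["opt_in"]),
])
toy = pd.DataFrame({"nb_tickets": [1.0, None, 3.0], "opt_in": [1, 0, None]})
X = toy_preproc.fit_transform(toy)          # missing values imputed, numeric column standardized
print(toy_preproc.get_feature_names_out())  # ['num__nb_tickets' 'bin__opt_in']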
def draw_confusion_matrix(y_test, y_pred, model):
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'], annot_kws={"size": 14})
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)


def draw_roc_curve(X_test, y_pred_prob, model):
    # Compute the false positive rate (FPR) and true positive rate (TPR)
    # (y_test is read from the enclosing scope; the X_test argument is unused)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

    # Compute the area under the ROC curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize = (14, 8))
    plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.xlabel("False Positive Rate", fontsize=14)
    plt.ylabel("True Positive Rate", fontsize=14)
    plt.title("ROC Curve", size=18)
    plt.legend(loc="lower right", fontsize=14)
    plt.show()
    save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)


def draw_calibration_curve(X_test, y_pred_prob, model):
    # (y_test is read from the enclosing scope; the X_test argument is unused)
    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

    # Plot the calibration curve
    plt.plot(mean_pred, frac_pos, 's-', label=model)
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positive predictions')
    plt.title("Calibration Curve")
    plt.legend()
    plt.show()
    save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)


def draw_features_importance(pipeline, model, randomF = False):
    if randomF:
        coefficients = pipeline.named_steps[model].feature_importances_
    else:
        coefficients = pipeline.named_steps[model].coef_[0]

    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.barh(feature_names, coefficients, color='skyblue')
    plt.xlabel("Features' Importance")
    plt.ylabel('Features')
    plt.title("Features' Importance")
    plt.grid(True)
    plt.show()
    save_file_s3("Features_", type_of_activity, type_of_model, model)


def draw_prob_distribution(y_pred_prob, model):
    plt.figure(figsize=(10, 8))
    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)

    plt.xlim(0, 1)
    plt.ylim(0, None)

    plt.title('Histogram of predicted probabilities for class 1')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_", type_of_activity, type_of_model, model)


def draw_prob_distribution_companies(y_pred_prob, model):
    # dataset_test is read from the enclosing scope
    test = dataset_test.copy()
    test['probability to buy'] = y_pred_prob
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of probabilities for class 1 by company')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)


def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
                                                            max_iter=5000, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "LogisticRegression_Benchmark"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
    return model_result


def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
    y_train = y_train['y_has_purchased']
    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "LogisticRegression_cv"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
    return model_result


def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF', RandomForestClassifier(class_weight = weight_dict,
                                           n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "randomF"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'randomF', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
    return model_result


def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    y_train = y_train['y_has_purchased']
    param_grid = {
        'randomF_cv__n_estimators': [100, 300],
        'randomF_cv__max_features': ['sqrt', 'log2'],
        'randomF_cv__min_samples_split': [2, 10],
        'randomF_cv__min_samples_leaf': [1, 4],
        'randomF_cv__class_weight': [weight_dict]
    }
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "randomF_cv"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
    return model_result


def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_priors = counts / counts.sum()
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('Naive_Bayes', GaussianNB(priors=class_priors))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
    model = "Naive_Bayes"
    result = pd.DataFrame({"Model" : [model],
                           "Accuracy" : [accuracy_score(y_test, y_pred)],
                           "Recall" : [recall_score(y_test, y_pred)],
                           "F1_score" : [f1_score(y_test, y_pred, average="macro")],
                           "AUC" : [auc(fpr, tpr)]}
                          )
    model_result = pd.concat([model_result, result])
    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
    return model_result
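These helpers assume a thin driver script that wires them together and defines the shared module-level names. A hedged sketch of such a driver (the S3 endpoint configuration is omitted, and the class-weight computation shown is one plausible choice, not necessarily the project's):

import pandas as pd
import s3fs

# Assumed driver wiring: the helpers above read these names from module scope
fs = s3fs.S3FileSystem()
type_of_activity, type_of_model = 'sport', 'standard'

dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
preproc = preprocess(type_of_model, type_of_activity)

# One plausible class-weight choice: inverse class frequencies (illustrative)
freq = y_train['y_has_purchased'].value_counts(normalize=True)
weight_dict = {0: 1 / freq[0], 1: 1 / freq[1]}

model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)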
325 utils_sales_forecast.py Normal file
@@ -0,0 +1,325 @@
# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io

# functions
# (fs, the s3fs filesystem, is assumed to be defined by the calling script)

def load_train_test(type_of_activity):
    """
    Loads the training and test datasets from S3 storage for the type of activity specified.

    Args:
    - type_of_activity (str)

    Returns:
        DataFrame: Training dataset.
        DataFrame: Test dataset.
    """

    # BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test


def features_target_split(dataset_train, dataset_test):
    """
    Splits the dataset into features and target variables for training and testing.

    Args:
    - dataset_train (DataFrame): Training dataset.
    - dataset_test (DataFrame): Test dataset.

    Returns:
        DataFrame: Features of the training dataset.
        DataFrame: Features of the test dataset.
        DataFrame: Target variable of the training dataset.
        DataFrame: Target variable of the test dataset.
    """

    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
                  'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in',  # 'is_partner', 'nb_tickets_internet',
                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']

    X_train = dataset_train  # [features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test  # [features_l]
    y_test = dataset_test[['y_has_purchased']]

    return X_train, X_test, y_train, y_test


def load_model(type_of_activity, model):
    """
    Loads from S3 storage the optimal parameters of the chosen ML model, saved in a pickle file.

    Args:
    - type_of_activity (str)
    - model (str)

    Returns:
        Model: machine learning model pre-trained with a scikit-learn pipeline.
    """

    # BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    filename = model + '.pkl'
    file_path = BUCKET + filename
    with fs.open(file_path, mode="rb") as f:
        model_bytes = f.read()

    model = pickle.loads(model_bytes)
    return model


def df_segment(df, y, model):
    """
    Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model.

    Args:
    - df (DataFrame): DataFrame to be segmented.
    - y (Series): True target variable.
    - model (Model): Pre-trained machine learning model for prediction.

    Returns:
        DataFrame: Segmented DataFrame with predicted values and true values for y.
    """

    y_pred = model.predict(df)
    y_pred_prob = model.predict_proba(df)[:, 1]

    df_segment = df

    df_segment["has_purchased"] = y
    df_segment["has_purchased_estim"] = y_pred
    df_segment["score"] = y_pred_prob
    df_segment["quartile"] = np.where(df_segment['score'] < 0.25, '1',
                                      np.where(df_segment['score'] < 0.5, '2',
                                               np.where(df_segment['score'] < 0.75, '3', '4')))

    return df_segment
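Note that the "quartile" here is a fixed-threshold score band (cut-offs at 0.25, 0.5 and 0.75), not an empirical quartile of the score distribution. The banding logic in isolation, on made-up scores:

import numpy as np
import pandas as pd

scores = pd.Series([0.05, 0.30, 0.60, 0.95])
bands = np.where(scores < 0.25, '1',
                 np.where(scores < 0.5, '2',
                          np.where(scores < 0.75, '3', '4')))
print(list(bands))  # ['1', '2', '3', '4']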
def odd_ratio(score):
    """
    Args:
    - score (Union[float, int])

    Returns:
        float: Odds ratio value.
    """

    return score / (1 - score)


def adjust_score_1(score):
    """
    Adjust scores by replacing ones with the second-highest value.
    This makes it possible to compute odds ratios afterwards, since the
    odds ratio is undefined for a score of exactly 1.

    Args:
    - score (List[Union[float, int]])

    Returns:
        np.ndarray: Adjusted score values.
    """

    second_best_score = np.array([element for element in score if element != 1]).max()
    new_score = np.array([element if element != 1 else second_best_score for element in score])
    return new_score


def adjusted_score(odd_ratio, bias):
    """
    Adjust the score based on the odds ratio and a bias term.

    Args:
    - odd_ratio (Union[float, int])
    - bias (Union[float, int])

    Returns:
        float: Adjusted score value.
    """

    adjusted_score = odd_ratio / (bias + odd_ratio)
    return adjusted_score


def find_bias(odd_ratios, y_objective, initial_guess=10):
    """
    Find the bias needed to adjust the scores so that their sum equals the total number of purchases observed.

    Args:
    - odd_ratios (List[float]): Odds ratios associated with the scores that have to be adjusted.
    - y_objective (Union[float, int]): Objective value, i.e. the total number of purchases.
    - initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (the bias is approximately 6 for sports, 10 for music and 22 for museums).

    Returns:
        float: Estimated bias value.
    """

    bias_estimated = fsolve(lambda bias: sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess)

    return bias_estimated[0]
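To make the calibration step concrete: for a score s, the adjusted score is odds(s) / (bias + odds(s)), and find_bias solves for the bias that makes the adjusted scores sum to the observed purchase count. A small self-contained check on toy numbers:

import numpy as np
from scipy.optimize import fsolve

scores = np.array([0.2, 0.5, 0.8])
odds = scores / (1 - scores)        # odd_ratio applied elementwise: [0.25, 1.0, 4.0]
target_total = 1.0                  # suppose exactly 1 purchase was actually observed

bias = fsolve(lambda b: (odds / (b + odds)).sum() - target_total, x0=10)[0]
adjusted = odds / (bias + odds)
print(round(bias, 2), adjusted.round(3), round(adjusted.sum(), 6))  # adjusted scores now sum to 1.0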
def plot_hist_scores(df, score, score_adjusted, type_of_activity):
    """
    Plot a histogram comparing scores and adjusted scores.

    Args:
    - df (DataFrame): DataFrame containing score data.
    - score (str): Name of the column in df holding the original scores.
    - score_adjusted (str): Name of the column in df holding the adjusted scores.
    - type_of_activity (str): Type of activity of the companies considered.

    Returns:
        None
    """

    plt.figure()
    plt.hist(df[score], label="score", alpha=0.6)
    plt.hist(df[score_adjusted], label="adjusted score", alpha=0.6)
    plt.legend()
    plt.xlabel("probability of a future purchase")
    plt.ylabel("count")
    plt.title(f"Comparison between score and adjusted score for {type_of_activity} companies")
    # plt.show()


def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection):
    """
    Project tickets sold and total amount based on the adjusted scores and the durations of the study / projection periods.

    Args:
    - df (DataFrame): DataFrame containing information about past sales.
    - nb_purchases (str): Name of the column in df holding the number of purchases.
    - nb_tickets (str): Name of the column in df holding the number of tickets.
    - total_amount (str): Name of the column in df holding the total amount.
    - score_adjusted (str): Name of the column in df holding the adjusted score.
    - duration_ref (int or float): Duration of the reference period used to build the X variables.
    - duration_projection (int or float): Duration of the period over which sales / revenue are projected.

    Returns:
        DataFrame: DataFrame completed with sales and total amount projections.
    """

    duration_ratio = duration_ref / duration_projection

    df_output = df

    # project the number of tickets: at least 1 ticket purchased if the customer purchases at all
    df_output.loc[:, "nb_tickets_projected"] = df_output.loc[:, nb_tickets].apply(lambda x: max(1, x / duration_ratio))

    # project the amount: if the customer buys a ticket, we expect the amount to be at least the
    # average ticket price among customers purchasing exactly one ticket
    if df_output.loc[df_output[nb_tickets] == 1].shape[0] > 0:
        avg_price = df_output.loc[df_output[nb_tickets] == 1][total_amount].mean()
    else:
        avg_price = df_output[total_amount].mean()

    # compute the average ticket price for each customer
    df_output["avg_ticket_price"] = df_output[total_amount] / df_output[nb_tickets]

    # correct negative total amounts
    df_output.loc[:, "total_amount_corrected"] = np.where(df_output[total_amount] < 0,
                                                          avg_price * df_output[nb_tickets],
                                                          df_output[total_amount])

    df_output.loc[:, "total_amount_projected"] = np.where(
        # if no ticket was bought in the past, take the average price
        df_output[nb_tickets] == 0, avg_price,
        # if a customer's average ticket price is negative, recompute the expected amount from the
        # average ticket price observed on the whole population
        # (the original code referenced X_test_segment here; df_output is clearly what was meant)
        np.where(df_output["avg_ticket_price"] < 0, avg_price * df_output.loc[:, "nb_tickets_projected"],
                 # otherwise, the projected amount is the customer's average ticket price * projected number of tickets
                 df_output["avg_ticket_price"] * df_output.loc[:, "nb_tickets_projected"])
    )

    df_output.loc[:, "nb_tickets_expected"] = df_output.loc[:, score_adjusted] * df_output.loc[:, "nb_tickets_projected"]
    df_output.loc[:, "total_amount_expected"] = df_output.loc[:, score_adjusted] * df_output.loc[:, "total_amount_projected"]

    df_output.loc[:, "pace_purchase"] = (duration_ref / df_output.loc[:, nb_purchases]).apply(lambda x: np.nan if x == np.inf else x)

    return df_output
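A quick numeric reading of the projection arithmetic, with made-up inputs:

duration_ref, duration_projection = 17, 12
duration_ratio = duration_ref / duration_projection               # ~1.417

nb_tickets_past = 5
nb_tickets_projected = max(1, nb_tickets_past / duration_ratio)   # ~3.53 tickets over 12 months

score_adjusted = 0.4                                              # illustrative propensity
nb_tickets_expected = score_adjusted * nb_tickets_projected       # ~1.41 expected tickets
print(round(nb_tickets_projected, 2), round(nb_tickets_expected, 2))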
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
                        duration_ref=17, duration_projection=12):
    """
    Generate a summary of expected customer sales by segment.

    Args:
    - df (DataFrame): DataFrame containing customer data.
    - segment (str): Name of the column in df holding the customer segments.
    - nb_tickets_expected (str): Name of the column in df holding the expected number of tickets.
    - total_amount_expected (str): Name of the column in df holding the expected total amount.
    - total_amount (str): Name of the column in df holding the total amount.
    - pace_purchase (str): Name of the column in df holding the average time between 2 purchases, in months.
    - duration_ref (int or float): Duration of the reference period used to build the X variables.
    - duration_projection (int or float): Duration of the period over which sales / revenue are projected.

    Returns:
        DataFrame: Summary DataFrame containing expected customer sales metrics.
    """

    # compute the expected number of tickets and the expected total amount by segment
    df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()

    # number of customers by segment
    df_expected_CA.insert(1, "size", df.groupby(segment).size().values)

    # size as a percentage of all customers
    df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"] / df_expected_CA["size"].sum())

    # compute the share of revenue recovered
    duration_ratio = duration_ref / duration_projection

    df_expected_CA["revenue_recovered_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
        df.groupby(segment)[total_amount].sum().values

    df_expected_CA["share_future_revenue_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
        df[total_amount].sum()

    df_drop_null_pace = df.dropna(subset=[pace_purchase])
    df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values

    return df_expected_CA


def save_file_s3_ca(File_name, type_of_activity):
    """
    Save the current matplotlib figure to S3 storage.

    Args:
    - File_name (str)
    - type_of_activity (str)
    """

    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png', dpi=120)
    image_buffer.seek(0)
    PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()
335 utils_segmentation.py Normal file
@@ -0,0 +1,335 @@
# functions for segmentation and associated graphics

# imports needed by the helpers below (the original file did not declare them;
# fs is assumed to be the s3fs filesystem shared by the project's scripts)
import io
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate


def load_model(type_of_activity, model):
    """
    Loads from S3 storage the optimal parameters of the chosen ML model, saved in a pickle file.

    Args:
    - type_of_activity (str)
    - model (str)

    Returns:
        Model: machine learning model pre-trained with a scikit-learn pipeline.
    """

    BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    filename = model + '.pkl'
    file_path = BUCKET + filename
    with fs.open(file_path, mode="rb") as f:
        model_bytes = f.read()

    model = pickle.loads(model_bytes)
    return model


def load_test_file(type_of_activity):
    """
    Load the test dataset from S3 storage for the type of activity specified.

    Args:
    - type_of_activity (str)

    Returns:
        DataFrame: Test dataset.
    """

    file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    return dataset_test


def save_file_s3_mp(File_name, type_of_activity):
    """
    Save a matplotlib figure to S3 storage, in the location assigned to the type of activity specified.

    Args:
    - File_name (str)
    - type_of_activity (str)

    Returns:
        None
    """

    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png', dpi=110)
    image_buffer.seek(0)
    PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def save_txt_file_s3(file_name, type_of_activity, content):
    """
    Save a text file to S3 storage, in the location assigned to the type of activity specified.

    Args:
    - file_name (str)
    - type_of_activity (str)
    - content (str)

    Returns:
        None
    """

    FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
    with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:
        s3_file.write(content)


def df_business_fig(df, segment, list_var):
    """
    Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables.

    Args:
    - df (DataFrame): The DataFrame containing the data.
    - segment (str): The column name representing segments.
    - list_var (list of str): The list of variable names to be aggregated.

    Returns:
        DataFrame: The DataFrame containing business KPIs.
    """

    df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()
    df_business_kpi.insert(1, "size", df.groupby(segment).size().values)
    all_var = ["size"] + list_var
    df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()

    return df_business_kpi


def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity):
    """
    Plot a histogram stacking the relative weight of each segment for some key business indicators.

    Args:
    - df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators.
    - segment (str): The column name representing segments.
    - size (str): The column name representing the size.
    - nb_tickets (str): The column name representing the number of tickets.
    - nb_purchases (str): The column name representing the number of purchases.
    - total_amount (str): The column name representing the total amount.
    - nb_campaigns (str): The column name representing the number of campaigns.
    - type_of_activity (str)

    Returns:
        None
    """

    plt.figure()

    df_plot = df[[segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns]]

    x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
         "number of\ncampaigns"]

    bottom = np.zeros(5)

    # shades of blue
    colors = plt.cm.Blues(np.linspace(0.1, 0.9, 4))

    for i in range(4):
        height = list(df_plot.loc[i, size:].values)
        plt.bar(x=x, height=height, label=str(df_plot[segment][i]), bottom=bottom, color=colors[i])
        bottom += height

    # Adjust margins
    plt.subplots_adjust(left=0.125, right=0.8, bottom=0.1, top=0.9)

    plt.legend(title="segment", loc="upper right", bbox_to_anchor=(1.2, 1))
    plt.ylabel("Fraction represented by the segment (%)")
    plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
    # plt.show()


# def df_segment_mp(df) :
#     df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
#     df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
#     df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
#     return df_mp


# def df_segment_pb (df) :
#     df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
#     return df_pb


def radar_mp_plot(df, categories, index):
    """
    Plot a radar chart describing the marketing persona of the segment associated with index, for the given categories.

    Args:
    - df (DataFrame): The DataFrame containing, per segment, the categories describing the associated marketing persona.
    - categories (list of str)
    - index (int): The index (between 0 and 3) identifying the segment. Here, index = number of the segment - 1.

    Returns:
        None
    """

    # true values are used to print the true value in parentheses
    tvalues = list(df.loc[index, categories])

    max_values = df[categories].max()

    # values are true values divided by the max among the 4 segments, which puts each value
    # in relation with the values of the other segments:
    # if a point sits at the maximal abscissa, the value is maximal for the segment considered,
    # even if it is not equal to 1

    values = list(df.loc[index, categories] / max_values)

    # normalized values are used to adjust each value around the circle:
    # for instance, if the maximum of values equals 0.8, we want the point to sit
    # at 8/10th of the circle radius, not at the edge
    values_normalized = [max(values) * elt for elt in values]

    # number of categories
    num_categories = len(categories)

    angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()

    # initialize the figure
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

    # a transparent line (alpha=0) of the raw values is drawn first to adjust the circle radius,
    # which is based on max(values); without this transparent line, the radius would be too small
    ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
    ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha=0.5, linewidth=1.2)

    # fill the sector
    ax.fill(angles, values_normalized, color='orange', alpha=0.4)

    # labels
    ax.set_yticklabels([])
    ax.set_xticks(angles)
    ticks = [categories[i].replace("_", " ") + f"\n({round(100 * tvalues[i], 2)}%)" for i in range(len(categories))]
    ax.set_xticklabels(ticks, color="black")

    ax.spines['polar'].set_visible(False)

    plt.title(f'Characteristics of the segment {index+1}\n')

    # plt.show()


def radar_mp_plot_all(df, type_of_activity):
    """
    Plot exactly the same radar charts as radar_mp_plot, but for all segments.

    Args:
    - df (DataFrame)
    - type_of_activity (str)

    Returns:
        None
    """

    # table summarizing the variables relative to marketing personae
    df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
    # df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
    df_mp.insert(4, "share_of_women", df_mp["gender_female"] / (df_mp["gender_female"] + df_mp["gender_male"]))

    # table relative to purchasing behaviour
    df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()

    # concatenate the tables to prepare the plot
    df_used = pd.concat([df_pb, df_mp[['share_of_women', 'age']]], axis=1)

    # rename columns for the plot
    df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})

    # visualization
    nb_segments = df_used.shape[0]
    categories = list(df_used.drop("segment", axis=1).columns)

    var_not_perc = ["age"]

    # initialize the figure
    fig, ax = plt.subplots(2, 2, figsize=(20, 21), subplot_kw=dict(polar=True))

    for index in range(nb_segments):
        row = index // 2  # integer division to get the row number
        col = index % 2

        # true values are used to print the true value in parentheses
        tvalues = list(df_used.loc[index, categories])

        max_values = df_used[categories].max()

        # values are true values divided by the max among the 4 segments, which puts each value
        # in relation with the values of the other segments:
        # if a point sits at the maximal abscissa, the value is maximal for the segment considered,
        # even if it is not equal to 1

        values = list(df_used.loc[index, categories] / max_values)

        # normalized values are used to adjust each value around the circle:
        # for instance, if the maximum of values equals 0.8, we want the point to sit
        # at 8/10th of the circle radius, not at the edge
        values_normalized = [max(values) * elt for elt in values]

        # number of categories
        num_categories = len(categories)

        angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()

        # a transparent line (alpha=0) of the raw values is drawn first to adjust the circle radius,
        # which is based on max(values); without this transparent line, the radius would be too small
        ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
        ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha=0.5,
                          linewidth=1.2)

        # fill the sector
        ax[row, col].fill(angles, values_normalized, color='orange', alpha=0.4, label=index)

        # labels
        ax[row, col].set_yticklabels([])
        ax[row, col].set_xticks(angles)

        # define the ticks
        values_printed = [str(round(tvalues[i], 2)) if categories[i] in var_not_perc else f"{round(100 * tvalues[i], 2)}%" for i in range(len(categories))]
        ticks = [categories[i].replace("_", " ") + f"\n({values_printed[i]})" for i in range(len(categories))]
        ax[row, col].set_xticklabels(ticks, color="black", size=20)

        ax[row, col].spines['polar'].set_visible(False)

        ax[row, col].set_title(f'Segment {index+1}\n', size=24)

    fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)

    plt.tight_layout()
    # plt.show()


def known_sociodemo_caracteristics(df, type_of_activity):
    """
    Compute the share of non-NaN values for some sociodemographic features and save the result in a LaTeX table.

    Args:
    - df (DataFrame)
    - type_of_activity (str)

    Returns:
        None
    """

    table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
    table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']
    table_share_known = table_share_known.pivot_table(index=None, columns='Segment')

    # round the DataFrame values to one decimal place
    table_share_known_rounded = table_share_known.round(1)

    # convert the DataFrame to LaTeX, with rounded values and escaped '%' symbols
    latex_table = tabulate(table_share_known_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
    latex_table = latex_table.replace('%', '\\%')

    save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)
467 utils_stat_desc.py Normal file
@@ -0,0 +1,467 @@
def load_files(nb_compagnie):
    """
    Load and preprocess the dataframes of every company.
    """

    customer = pd.DataFrame()
    campaigns_brut = pd.DataFrame()
    campaigns_kpi = pd.DataFrame()
    products = pd.DataFrame()
    tickets = pd.DataFrame()
    targets = pd.DataFrame()

    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
        df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
        df_target_information = display_input_databases(directory_path, file_name = "target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date = pd.Timestamp.now(tz='UTC'))
        df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
        df_target_KPI = targets_KPI(df_target = df_target_information)

        # Merge target KPIs onto the customer base and fill missing values with 0
        df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
        targets_columns = list(df_target_KPI.columns)
        targets_columns.remove('customer_id')
        df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)

        # Create company identifier
        df_tickets_kpi["number_company"] = int(directory_path)
        df_campaigns_brut["number_company"] = int(directory_path)
        df_campaigns_kpi["number_company"] = int(directory_path)
        df_customerplus_clean["number_company"] = int(directory_path)
        df_target_information["number_company"] = int(directory_path)
        df_target_KPI["number_company"] = int(directory_path)

        # Clean index: prefix customer ids with the company number so they stay unique after concatenation
        df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
        df_campaigns_brut["customer_id"] = directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
        df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
        df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
        df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

        # Remove companies' outliers
        df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)

        # Harmonize the set of customers across databases
        # (reassigning a loop variable would not modify the original dataframes, so filter each one explicitly)
        customer_id = df_tickets_kpi['customer_id'].to_list()
        df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
        df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
        df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
        df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]

        df_target_KPI["customer_id"] = directory_path + '_' + df_target_KPI['customer_id'].astype('str')

        # Concatenation
        customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
        campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
        campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
        tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
        products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
        targets = pd.concat([targets, df_target_KPI], ignore_index=True)

    return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
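
# Hedged usage sketch (the company list below is hypothetical): pool the data of
# companies 1 and 2 into cross-company dataframes.
# customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(["1", "2"])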
def remove_outlier_total_amount(tickets : pd.DataFrame):
    """
    Drop the customers whose total amount lies above Q3 + 1.5 * IQR.
    """
    Q1 = tickets['total_amount'].quantile(0.25)
    Q3 = tickets['total_amount'].quantile(0.75)
    IQR = Q3 - Q1
    upper = Q3 + 1.5 * IQR
    outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
    tickets = tickets[~tickets['customer_id'].isin(outliers)]
    return tickets
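
# Hedged sketch of the IQR rule above on hypothetical data: customer 'd' spends far
# above Q3 + 1.5*IQR and is removed, the three others are kept.
def _demo_remove_outlier_total_amount():
    toy = pd.DataFrame({'customer_id': ['a', 'b', 'c', 'd'],
                        'total_amount': [10.0, 12.0, 11.0, 10000.0]})
    return remove_outlier_total_amount(toy)  # keeps 'a', 'b', 'c'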
def save_file_s3(File_name, type_of_activity):
    """
    Save the current matplotlib figure as a png file in S3 storage.
    """
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight", dpi = 150)
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/2_Output/2_0_Descriptive_Statistics/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()
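
# Hedged usage sketch ("Sport" is a hypothetical type_of_activity): save the current
# figure to .../2_0_Descriptive_Statistics/Sport/demo_Sport.png on S3.
# plt.plot([1, 2, 3])
# save_file_s3("demo_", "Sport")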
def outlier_detection(tickets : pd.DataFrame, company_list, show_diagram=False):
    """
    Detect, for each company, the anonymous "catch-all" customer that concentrates
    the largest share of the total amount spent.
    """
    outlier_list = list()

    for company in company_list:
        total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
        total_amount_share['CA'] = total_amount_share['total_amount'].sum()
        total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']

        total_amount_share_index = total_amount_share.set_index('customer_id')
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
        #print('df circulaire : ', df_circulaire.head())
        top = df_circulaire[:1]
        #print('top : ', top)
        outlier_list.append(top.index[0])
        rest = df_circulaire[1:]

        rest_sum = rest.sum()

        new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

        if show_diagram:
            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            # plt.title(f'Breakdown of total amounts for company {company}')
            plt.show()
    return outlier_list
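
# Hedged sketch on hypothetical data: customer '1_1' concentrates most of company 1's
# revenue, so it is returned as the anonymous "catch-all" account.
def _demo_outlier_detection():
    toy = pd.DataFrame({'number_company': [1, 1, 1],
                        'customer_id': ['1_1', '1_2', '1_3'],
                        'total_amount': [9000.0, 50.0, 60.0]})
    return outlier_detection(toy, company_list=['1'])  # -> ['1_1']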
def valid_customer_detection(products : pd.DataFrame, campaigns_brut : pd.DataFrame):
    """
    Identify the customers that fall within our time perimeter, i.e. those who
    purchased a product or received a campaign e-mail since 2021-05-01.
    """
    products_valid = products[products['purchase_date']>="2021-05-01"]
    consumer_valid_product = products_valid['customer_id'].to_list()

    campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
    consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()

    consumer_valid = consumer_valid_product + consumer_valid_campaigns
    return consumer_valid
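
# Hedged sketch on hypothetical data: '1_1' bought after 2021-05-01 and is kept,
# while '1_2' only received a campaign before the cut-off and is not.
def _demo_valid_customer_detection():
    prods = pd.DataFrame({'customer_id': ['1_1'], 'purchase_date': [pd.Timestamp('2022-01-01')]})
    camps = pd.DataFrame({'customer_id': ['1_2'], 'sent_at': [pd.Timestamp('2020-01-01')]})
    return valid_customer_detection(prods, camps)  # -> ['1_1']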
def identify_purchase_during_target_periode(products : pd.DataFrame):
    """
    Identify the customers who purchased a ticket during the target period
    (2022-11-01 to 2023-11-01).
    """
    products_target_period = products[(products['purchase_date']>="2022-11-01")
                                      & (products['purchase_date']<="2023-11-01")]
    customer_target_period = products_target_period['customer_id'].to_list()
    return customer_target_period
def remove_elements(lst, elements_to_remove):
    """
    Remove every character of elements_to_remove from the string lst.
    """
    return ''.join([x for x in lst if x not in elements_to_remove])
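
# e.g. remove_elements('1_2024', '_') -> '12024'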
def compute_nb_clients(customer: pd.DataFrame, type_of_activity: str):
    company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
    plt.figure(figsize=(4,3))
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company Number')
    plt.ylabel("Number of clients (thousands)")
    # plt.title(f"Number of clients Across {type_of_activity} Companies")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer: pd.DataFrame, type_of_activity: str):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company Number')
    plt.ylabel("Maximal Price of a Ticket")
    # plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)
def target_proportion(customer: pd.DataFrame, type_of_activity: str):
    df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
                                                     'customer_id' : 'nunique'}).reset_index()
    df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
    plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
    plt.xlabel('Company Number')
    plt.ylabel('Share (%)')
    # plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
    plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
    plt.show()
    save_file_s3("share_target_", type_of_activity)
def mailing_consent(customer: pd.DataFrame, type_of_activity: str):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
    mailing_consent["opt_in"] *= 100
    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company Number')
    plt.ylabel('Mailing Consent (%)')
    # plt.title(f'Consent of mailing Across {type_of_activity} Companies')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer: pd.DataFrame, type_of_activity: str):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
    # Build the grouped barplot
    fig, ax = plt.subplots(figsize=(5, 3))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and create the grouped bars
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "Purchase" if label else "No purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Add the labels, the legend, etc.
    ax.set_xlabel('Company Number')
    ax.set_ylabel('Mailing Consent (%)')
    # ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    # Display the plot
    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer: pd.DataFrame, type_of_activity: str):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    company_genders["gender_male"] *= 100
    company_genders["gender_female"] *= 100
    company_genders["gender_other"] *= 100

    # Build the stacked barplot
    plt.figure(figsize=(4,3))
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom = company_genders["gender_male"], label = "Female")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")

    plt.xlabel('Company Number')
    plt.ylabel("Frequency (%)")
    # plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer: pd.DataFrame, type_of_activity: str):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    company_country_fr["country_fr"] *= 100
    plt.figure(figsize=(4,3))
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
    plt.xlabel('Company Number')
    plt.ylabel("Share of French Customers (%)")
    # plt.title(f"Share of French Customer Across {type_of_activity} Companies")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi: pd.DataFrame, type_of_activity: str):
    company_lazy_customers = campaigns_kpi.groupby("number_company")[["nb_campaigns", "taux_ouverture_mail"]].mean().reset_index()
    company_lazy_customers["taux_ouverture_mail"] *= 100

    # Initialize the figure
    fig, ax1 = plt.subplots(figsize=(6, 3))
    width = 0.4
    x = range(len(company_lazy_customers))

    # Plot the bars for "nb_campaigns" on the first y-axis
    ax1.bar([i - width/2 for i in x], company_lazy_customers['nb_campaigns'], width=width, align='center', label='Amount of Campaigns', color = 'steelblue')

    # Set labels and title for the first y-axis
    ax1.set_ylabel('Number of Mails Received', color='steelblue')
    ax1.tick_params(axis='y', labelcolor='steelblue')

    # Create another y-axis for "taux_ouverture_mail"
    ax2 = ax1.twinx()

    # Plot the bars for "taux_ouverture_mail" on the second y-axis
    ax2.bar([i + width/2 for i in x], company_lazy_customers['taux_ouverture_mail'], width=width, align='center', label='Open Mail Rate', color = 'darkorange')

    # Set labels and title for the second y-axis
    ax2.set_ylabel('Open Mail Rate (%)', color='darkorange')
    ax2.tick_params(axis='y', labelcolor='darkorange')

    # Set x-axis ticks and labels
    ax1.set_xticks(x)
    ax1.set_xticklabels(company_lazy_customers['number_company'])

    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer: pd.DataFrame, type_of_activity: str):

    campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()

    fig, ax = plt.subplots(figsize=(5, 3))

    categories = campaigns_effectiveness["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and create the grouped bars
    for label in campaigns_effectiveness["has_purchased_target_period"].unique():
        label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "Purchase" if label else "No purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Add the labels, the legend, etc.
    ax.set_xlabel('Company Number')
    ax.set_ylabel('Share of Consent (%)')
    # ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products : pd.DataFrame, campaigns_brut : pd.DataFrame, type_of_activity):
    # Month of the first purchase
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first e-mail received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')

    # Month from which the customer is considered as known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month by customer type
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)

    # Plot of the number of orders
    purchases_graph = nb_purchases_graph

    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
    plt.figure(figsize=(5.5,4))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")

    # Show only month-year labels on the x-axis
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    # plt.title(f"Number of Sales Across {type_of_activity} Companies")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)
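
# Hedged sketch of the "known customer" rule above: a purchase counts as coming from
# an existing customer once it happens more than one month after the known_date.
def _demo_is_customer_known():
    known_date = pd.Timestamp('2021-05-01')
    purchase_date = pd.Timestamp('2021-07-15')
    return purchase_date > known_date + pd.DateOffset(months=1)  # True -> existing customer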
def tickets_internet(tickets: pd.DataFrame, type_of_activity: str):
    nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
    nb_tickets_internet['prop_purchases_internet'] *= 100
    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])

    plt.xlabel('Company Number')
    plt.ylabel("Share of Purchases Bought Online (%)")
    # plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)
def already_bought_online(tickets: pd.DataFrame, type_of_activity: str):
    nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
                                                                  'customer_id' : 'nunique'}).reset_index())
    nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/nb_consumers_online["customer_id"])*100

    plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])

    plt.xlabel('Company Number')
    plt.ylabel("Share of Customers who Bought Online at least once (%)")
    # plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
    plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
    plt.show()
    save_file_s3("First_buy_internet_", type_of_activity)
def box_plot_price_tickets(tickets: pd.DataFrame, type_of_activity: str):
    price_tickets = tickets[(tickets['total_amount'] > 0)]
    plt.figure(figsize=(4,3))
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    # plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
    plt.xlabel('Company Number')
    plt.ylabel("Total Amount Spent")
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)
def target_description(targets : pd.DataFrame, type_of_activity: str):

    describe_target = targets.groupby('number_company').agg(
        prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
        prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
        prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
        prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
        prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
        prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
        prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
        prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))

    plot = describe_target.plot.bar()

    # Adding a title
    # plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")

    # Adding labels for x and y axes
    plot.set_xlabel("Company Number")
    plot.set_ylabel("Target Proportion (%)")

    plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')

    # Adding a legend
    plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")

    save_file_s3("target_category_proportion_", type_of_activity)