Merge branch 'main' into segment_value
commit 3d6414728c
@ -1,5 +1,8 @@
|
|||
# Business Data Challenge - Team 1
|
||||
# Purpose of the script : Construction of training and test datasets for modelling by company
|
||||
# Input : KPI construction function and clean databases in the 0_Input folder
|
||||
# Output : Train and test datasets by companies
|
||||
|
||||
# Packages
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
|
@ -9,12 +12,10 @@ import warnings
|
|||
from datetime import date, timedelta, datetime
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
# Create filesystem object
|
||||
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
||||
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
|
||||
|
||||
|
||||
# Import KPI construction functions
|
||||
exec(open('0_KPI_functions.py').read())
|
||||
|
||||
|
@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')
|
|||
|
||||
def dataset_construction(min_date, end_features_date, max_date, directory_path):
|
||||
|
||||
# Import customerplus
|
||||
# Import of cleaned and merged datasets
|
||||
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
|
||||
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
|
||||
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
|
||||
df_target_information = display_input_databases(directory_path, file_name = "target_information")
|
||||
|
||||
# if directory_path == "101":
|
||||
# df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
|
||||
# df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])
|
||||
|
||||
# Filtre de cohérence pour la mise en pratique de notre méthode
|
||||
# Dates in datetime format
|
||||
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
|
||||
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
|
||||
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
|
||||
|
||||
#Filtre de la base df_campaigns_information
|
||||
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
|
||||
# Filter for database df_campaigns_information
|
||||
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
|
||||
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
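# Mask e-mail opens that occur on or after the feature cut-off date (avoids using post-cut-off information in the features)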
|
||||
|
||||
#Filtre de la base df_products_purchased_reduced
|
||||
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
|
||||
# Filter for database df_products_purchased_reduced
|
||||
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
|
||||
|
||||
print("Data filtering : SUCCESS")
|
||||
|
||||
# Fusion de l'ensemble et creation des KPI
|
||||
# Building and merging features
|
||||
|
||||
# KPI sur les campagnes publicitaires
|
||||
# Campaigns features
|
||||
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
|
||||
|
||||
# KPI sur le comportement d'achat
|
||||
# Purchasing behavior features
|
||||
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
|
||||
|
||||
# KPI sur les données socio-démographiques
|
||||
# Socio-demographic features
|
||||
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
|
||||
|
||||
# Targets features
|
||||
df_targets_kpi = targets_KPI(df_target = df_target_information)
|
||||
|
||||
print("KPIs construction : SUCCESS")
|
||||
|
||||
# Fusion avec KPI liés au customer
|
||||
# Merge - campaigns features
|
||||
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
|
||||
|
||||
# Fill NaN values
|
||||
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
|
||||
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
|
||||
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
|
||||
|
||||
# Fusion avec KPI liés au comportement d'achat
|
||||
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
|
||||
# Merge - targets features
|
||||
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
|
||||
|
||||
# Fill NaN values
|
||||
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
|
||||
targets_columns = list(df_targets_kpi.columns)
|
||||
targets_columns.remove('customer_id')
|
||||
|
||||
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
|
||||
|
||||
# We standardise the number of targets closely linked to the company's operations
|
||||
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
|
||||
|
||||
# Merge - purchasing behavior features
|
||||
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
|
||||
|
||||
# Fill NaN values
|
||||
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
|
||||
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
|
||||
|
||||
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
|
||||
|
||||
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
|
||||
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
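# Customers with no purchase in the feature window get the full window length (in days) as their purchase-recency values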
|
||||
|
@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
|
|||
print("Explanatory variable construction : SUCCESS")
|
||||
|
||||
# 2. Construction of the explained variable
|
||||
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
|
||||
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
|
||||
|
||||
# Indicatrice d'achat
|
||||
# Construction of the dependent variable
|
||||
df_products_purchased_to_predict['y_has_purchased'] = 1
|
||||
|
||||
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
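# One row per customer who made at least one purchase during the prediction window (y_has_purchased = 1)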
|
||||
|
@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
|
|||
return dataset
|
||||
|
||||
## Exportation
|
||||
|
||||
# Sectors
|
||||
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
|
||||
'sport': ['5', '6', '7', '8', '9'],
|
||||
'musique' : ['10', '11', '12', '13', '14']}
|
||||
|
||||
# Chosen sector
|
||||
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
|
||||
list_of_comp = companies[type_of_comp]
|
||||
# Dossier d'exportation
|
||||
BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'
|
||||
|
||||
# Create test and train datasets for the companies of the selected sector
|
||||
|
||||
|
||||
#start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_features = 0.7)
|
||||
|
||||
# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)
|
||||
# Export folder
|
||||
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
|
||||
|
||||
# Dates used for the construction of features and the dependent variable
|
||||
start_date = "2021-05-01"
|
||||
end_of_features = "2022-11-01"
|
||||
final_date = "2023-11-01"
|
||||
|
||||
|
||||
# Anonymous customer to be deleted from the datasets
|
||||
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
|
||||
'5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
|
||||
'10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
|
||||
|
@ -133,33 +146,23 @@ for company in list_of_comp:
|
|||
dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
|
||||
max_date = final_date, directory_path = company)
|
||||
|
||||
# On retire le client anonyme
|
||||
# Deletion of the anonymous customer
|
||||
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
|
||||
|
||||
|
||||
# #train test set
|
||||
# np.random.seed(42)
|
||||
|
||||
# split_ratio = 0.7
|
||||
# split_index = int(len(dataset) * split_ratio)
|
||||
# dataset = dataset.sample(frac=1).reset_index(drop=True)
|
||||
# dataset_train = dataset.iloc[:split_index]
|
||||
# dataset_test = dataset.iloc[split_index:]
|
||||
|
||||
# Split between train and test
|
||||
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
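# 70/30 train/test split with a fixed random_state for reproducibility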
|
||||
|
||||
# Dataset Test
|
||||
# Exportation
|
||||
# Export
|
||||
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
|
||||
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
|
||||
|
||||
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
|
||||
dataset_test.to_csv(file_out, index = False)
|
||||
|
||||
print("Exportation dataset test : SUCCESS")
|
||||
print("Export of dataset test : SUCCESS")
|
||||
|
||||
# Dataset train
|
||||
|
||||
# Export
|
||||
FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
|
||||
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
|
||||
|
@ -167,7 +170,7 @@ for company in list_of_comp:
|
|||
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
|
||||
dataset_train.to_csv(file_out, index = False)
|
||||
|
||||
print("Exportation dataset train : SUCCESS")
|
||||
print("Export of dataset train : SUCCESS")
|
||||
|
||||
|
||||
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
|
||||
print("End of dataset generation for ", type_of_comp," compagnies : SUCCESS")
|
||||
|
|
|
@ -21,7 +21,7 @@ warnings.filterwarnings('ignore')
|
|||
|
||||
# functions
|
||||
def generate_test_set(type_of_comp):
|
||||
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
|
||||
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Test_set")
|
||||
test_set = pd.DataFrame()
|
||||
for file in file_path_list:
|
||||
print(file)
|
||||
|
@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):
|
|||
|
||||
|
||||
def generate_train_set(type_of_comp):
|
||||
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
|
||||
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Train_set")
|
||||
train_set = pd.DataFrame()
|
||||
for file in file_path_list:
|
||||
print(file)
|
||||
|
@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):
|
|||
|
||||
|
||||
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
|
||||
BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/'
|
||||
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}/'
|
||||
|
||||
# create test and train datasets
|
||||
test_set = generate_test_set(type_of_comp)
|
||||
|
|
0_4_Generate_stat_desc.py (new file, 74 lines)
|
@ -0,0 +1,74 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import warnings
|
||||
|
||||
# Ignore warning
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
exec(open('0_KPI_functions.py').read())
|
||||
exec(open('utils_stat_desc.py').read())
|
||||
|
||||
# Create filesystem object
|
||||
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
||||
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
|
||||
|
||||
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
|
||||
'sport': ['5', '6', '7', '8', '9'],
|
||||
'musique' : ['10', '11', '12', '13', '14']}
|
||||
|
||||
|
||||
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
|
||||
list_of_comp = companies[type_of_activity]
|
||||
|
||||
# Load files
|
||||
customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(list_of_comp)
|
||||
|
||||
# Identify anonymous customers for each company and remove them from our datasets
|
||||
outlier_list = outlier_detection(tickets, list_of_comp)
|
||||
|
||||
# Identify valid customers (customers who bought tickets or received mails after the starting date)
|
||||
customer_valid_list = valid_customer_detection(products, campaigns_brut)
|
||||
|
||||
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
|
||||
|
||||
for dataset in databases:
|
||||
dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
|
||||
dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
|
||||
#print(f'shape of {dataset} : ', dataset.shape)
|
||||
|
||||
# Identify customers who bought during the target period (y)
|
||||
customer_target_period = identify_purchase_during_target_periode(products)
|
||||
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
|
||||
|
||||
# Generate graphs and automatically save them in the bucket
|
||||
compute_nb_clients(customer, type_of_activity)
|
||||
|
||||
#maximum_price_paid(customer, type_of_activity)
|
||||
|
||||
target_proportion(customer, type_of_activity)
|
||||
|
||||
mailing_consent(customer, type_of_activity)
|
||||
|
||||
mailing_consent_by_target(customer)
|
||||
|
||||
gender_bar(customer, type_of_activity)
|
||||
|
||||
country_bar(customer, type_of_activity)
|
||||
|
||||
lazy_customer_plot(campaigns_kpi, type_of_activity)
|
||||
|
||||
campaigns_effectiveness(customer, type_of_activity)
|
||||
|
||||
sale_dynamics(products, campaigns_brut, type_of_activity)
|
||||
|
||||
tickets_internet(tickets, type_of_activity)
|
||||
|
||||
already_bought_online(tickets, type_of_activity)
|
||||
|
||||
box_plot_price_tickets(tickets, type_of_activity)
|
||||
|
||||
target_description(targets, type_of_activity)
|
0_5_Machine_Learning.py (new file, 87 lines)
|
@ -0,0 +1,87 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
|
||||
from sklearn.utils import class_weight
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.calibration import calibration_curve
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
|
||||
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
|
||||
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
|
||||
exec(open('utils_ml.py').read())
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
warnings.filterwarnings("ignore", category=DataConversionWarning)
|
||||
|
||||
# choose the type of companies for which you want to run the pipeline
|
||||
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
|
||||
# choose the type of model
|
||||
type_of_model = input('Choisissez le type de model : basique ? premium ?')
|
||||
|
||||
# load train and test set
|
||||
# Create filesystem object
|
||||
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
||||
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
|
||||
|
||||
dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
|
||||
|
||||
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
|
||||
|
||||
print("Shape train : ", X_train.shape)
|
||||
print("Shape test : ", X_test.shape)
|
||||
|
||||
# processing
|
||||
|
||||
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
|
||||
y = y_train['y_has_purchased'])
|
||||
|
||||
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
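# weight_dict maps each class label (e.g. 0.0 / 1.0) to its balanced weight, to compensate for class imbalance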
|
||||
|
||||
preproc = preprocess(type_of_model, type_of_activity)
|
||||
|
||||
# Object for storing results
|
||||
model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
|
||||
|
||||
# Naive Bayes
|
||||
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
|
||||
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
|
||||
print("Naive Bayes : Done")
|
||||
|
||||
# Logistic Regression
|
||||
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
|
||||
print("Logistic : Done")
|
||||
|
||||
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
|
||||
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
|
||||
print("Logistic CV : Done")
|
||||
|
||||
# Random Forest
|
||||
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
|
||||
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
|
||||
print("Random Forest : Done")
|
||||
|
||||
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
|
||||
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
|
||||
print("Random Forest CV: Done")
|
||||
|
||||
# Save result
|
||||
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
|
0_6_Segmentation.py (new file, 40 lines)
|
@ -0,0 +1,40 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
|
||||
exec(open('utils_segmentation.py').read())
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# Create filesystem object
|
||||
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
||||
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
|
||||
|
||||
# choose the type of companies for which you want to run the pipeline
|
||||
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
|
||||
|
||||
# load test set
|
||||
dataset_test = load_test_file(type_of_activity)
|
||||
|
||||
# Load Model
|
||||
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')
|
||||
|
||||
# Processing
|
||||
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
|
||||
'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
|
||||
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]
|
||||
|
||||
y_test = dataset_test[['y_has_purchased']]
|
||||
|
||||
# Prediction
|
||||
y_pred_prob = model.predict_proba(X_test)[:, 1]
|
||||
|
||||
# Add probability to dataset_test
|
||||
dataset_test['Probability_to_buy'] = y_pred_prob
|
||||
print('probability added to dataset_test')
|
||||
print(dataset_test.head())
|
|
@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
|
|||
cleaning_date(customerplus_copy, 'last_visiting_date')
|
||||
|
||||
# Variable selection
|
||||
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at'], axis = 1, inplace=True)
|
||||
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
|
||||
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
|
||||
|
||||
return customerplus_copy
|
||||
|
|
|
@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
|
|||
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
|
||||
return df
|
||||
|
||||
def campaigns_kpi_function(campaigns_information = None, max_date = None):
|
||||
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
|
||||
|
||||
# Number of e-mail campaigns
|
||||
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
|
||||
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
|
||||
|
||||
# Average opening time (in hours)
|
||||
campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
|
||||
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
|
||||
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
|
||||
|
||||
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
|
||||
|
@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):
|
|||
|
||||
return campaigns_reduced
|
||||
|
||||
|
||||
def tickets_kpi_function(tickets_information = None):
|
||||
|
||||
tickets_information_copy = tickets_information.copy()
|
||||
|
@ -100,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
|
|||
})
|
||||
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
|
||||
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
|
||||
customerplus_clean.drop(columns = "gender", inplace = True)
|
||||
|
||||
|
||||
# Age
|
||||
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
|
||||
|
@ -112,19 +113,53 @@ def customerplus_kpi_function(customerplus_clean = None):
|
|||
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
|
||||
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
|
||||
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
|
||||
# customerplus_clean.drop(columns = "age", inplace = True)
|
||||
|
||||
# Mailing consent
|
||||
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
|
||||
|
||||
# Indicator for whether the individual lives in France
|
||||
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
|
||||
# customerplus_clean.drop(columns = "country", inplace = True)
|
||||
|
||||
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
|
||||
# customerplus_clean.drop(columns = "profession", inplace = True)
|
||||
|
||||
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
|
||||
# customerplus_clean.drop(columns = "zipcode", inplace = True)
|
||||
|
||||
# Dummy if the customer has a structure id (tags)
|
||||
# customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
|
||||
|
||||
return customerplus_clean
|
||||
|
||||
def targets_KPI(df_target = None):
|
||||
|
||||
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
|
||||
|
||||
# Target name categories for museums
|
||||
df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
|
||||
df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
|
||||
df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
|
||||
df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
|
||||
df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
|
||||
df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
|
||||
df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
|
||||
|
||||
# Target name categories for sport companies
|
||||
df_target['target_abonne'] = ((
|
||||
df_target['target_name']
|
||||
.str.contains('|'.join(['abo', 'adh']), case=False)
|
||||
& ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
|
||||
).astype(int))
|
||||
|
||||
df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
|
||||
|
||||
target_agg = df_target.groupby('customer_id').agg(
|
||||
nb_targets=('target_name', 'nunique') # Use tuples to specify the output column names
|
||||
# all_targets=('target_name', concatenate_names),
|
||||
# all_target_types=('target_type_name', concatenate_names)
|
||||
).reset_index()
|
||||
|
||||
target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
|
||||
|
||||
return target_agg
|
||||
|
Descriptive_statistics/debug.ipynb (new file, 148 lines): file diff suppressed because one or more lines are too long
Descriptive_statistics/generate_stat_desc.py (new file, 68 lines)
|
@ -0,0 +1,68 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import warnings
|
||||
|
||||
# Ignore warning
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
exec(open('../0_KPI_functions.py').read())
|
||||
exec(open('plot.py').read())
|
||||
|
||||
# Create filesystem object
|
||||
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
|
||||
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
|
||||
|
||||
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
|
||||
'sport': ['5'],
|
||||
'musique' : ['10', '11', '12', '13', '14']}
|
||||
|
||||
|
||||
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
|
||||
list_of_comp = companies[type_of_activity]
|
||||
|
||||
# Load files
|
||||
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)
|
||||
|
||||
# Identify anonymous customers for each company and remove them from our datasets
|
||||
outlier_list = outlier_detection(tickets, list_of_comp)
|
||||
|
||||
# Identify valid customers (customers who bought tickets or received mails after the starting date)
|
||||
customer_valid_list = valid_customer_detection(products, campaigns_brut)
|
||||
|
||||
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
|
||||
|
||||
for dataset in databases:
|
||||
dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
|
||||
dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
|
||||
#print(f'shape of {dataset} : ', dataset.shape)
|
||||
|
||||
# Identify customers who bought during the target period (y)
|
||||
customer_target_period = identify_purchase_during_target_periode(products)
|
||||
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
|
||||
|
||||
# Generate graphs and automatically save them in the bucket
|
||||
compute_nb_clients(customer, type_of_activity)
|
||||
|
||||
maximum_price_paid(customer, type_of_activity)
|
||||
|
||||
mailing_consent(customer, type_of_activity)
|
||||
|
||||
mailing_consent_by_target(customer)
|
||||
|
||||
gender_bar(customer, type_of_activity)
|
||||
|
||||
country_bar(customer, type_of_activity)
|
||||
|
||||
lazy_customer_plot(campaigns_kpi, type_of_activity)
|
||||
|
||||
#campaigns_effectiveness(customer, type_of_activity)
|
||||
|
||||
sale_dynamics(products, campaigns_brut, type_of_activity)
|
||||
|
||||
tickets_internet(tickets, type_of_activity)
|
||||
|
||||
box_plot_price_tickets(tickets, type_of_activity)
|
Descriptive_statistics/plot.py (new file, 328 lines)
|
@ -0,0 +1,328 @@
|
|||
import pandas as pd
|
||||
import os
|
||||
import s3fs
|
||||
import io
|
||||
import warnings
|
||||
from datetime import date, timedelta, datetime
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates as mdates
|
||||
import seaborn as sns
|
||||
|
||||
|
||||
def load_files(nb_compagnie):
|
||||
customer = pd.DataFrame()
|
||||
campaigns_brut = pd.DataFrame()
|
||||
campaigns_kpi = pd.DataFrame()
|
||||
products = pd.DataFrame()
|
||||
tickets = pd.DataFrame()
|
||||
|
||||
# Start of the loop that builds aggregated datasets for the 5 entertainment companies
|
||||
for directory_path in nb_compagnie:
|
||||
df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
|
||||
df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
|
||||
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
|
||||
df_target_information = display_databases(directory_path, file_name = "target_information")
|
||||
|
||||
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
|
||||
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
|
||||
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
|
||||
|
||||
|
||||
# Create the company-number column, which will be used to aggregate the results
|
||||
df_tickets_kpi["number_company"]=int(directory_path)
|
||||
df_campaigns_brut["number_company"]=int(directory_path)
|
||||
df_campaigns_kpi["number_company"]=int(directory_path)
|
||||
df_customerplus_clean["number_company"]=int(directory_path)
|
||||
df_target_information["number_company"]=int(directory_path)
|
||||
|
||||
# Index processing: prefix customer_id with the company number
|
||||
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
|
||||
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
|
||||
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
|
||||
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
|
||||
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
|
||||
|
||||
# Concatenation
|
||||
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
|
||||
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
|
||||
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
|
||||
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
|
||||
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
|
||||
|
||||
return customer, campaigns_kpi, campaigns_brut, tickets, products
|
||||
|
||||
|
||||
def save_file_s3(File_name, type_of_activity):
|
||||
image_buffer = io.BytesIO()
|
||||
plt.savefig(image_buffer, format='png')
|
||||
image_buffer.seek(0)
|
||||
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
|
||||
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
|
||||
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
|
||||
s3_file.write(image_buffer.read())
|
||||
plt.close()
|
||||
|
||||
|
||||
def outlier_detection(tickets, company_list, show_diagram=False):
|
||||
|
||||
outlier_list = list()
|
||||
|
||||
for company in company_list:
|
||||
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
|
||||
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
|
||||
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
|
||||
|
||||
total_amount_share_index = total_amount_share.set_index('customer_id')
|
||||
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
|
||||
#print('df circulaire : ', df_circulaire.head())
|
||||
top = df_circulaire[:1]
|
||||
#print('top : ', top)
|
||||
outlier_list.append(top.index[0])
|
||||
rest = df_circulaire[1:]
|
||||
|
||||
rest_sum = rest.sum()
|
||||
|
||||
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
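# Keep the largest customer separately and lump all remaining customers into a single 'Autre' (other) slice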
|
||||
|
||||
if show_diagram:
|
||||
plt.figure(figsize=(3, 3))
|
||||
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
|
||||
plt.axis('equal')
|
||||
plt.title(f'Répartition des montants totaux pour la compagnie {company}')
|
||||
plt.show()
|
||||
return outlier_list
|
||||
|
||||
|
||||
def valid_customer_detection(products, campaigns_brut):
|
||||
products_valid = products[products['purchase_date']>="2021-05-01"]
|
||||
consumer_valid_product = products_valid['customer_id'].to_list()
|
||||
|
||||
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
|
||||
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
|
||||
|
||||
consumer_valid = consumer_valid_product + consumer_valid_campaigns
|
||||
return consumer_valid
|
||||
|
||||
|
||||
def identify_purchase_during_target_periode(products):
|
||||
products_target_period = products[(products['purchase_date']>="2022-11-01")
|
||||
& (products['purchase_date']<="2023-11-01")]
|
||||
customer_target_period = products_target_period['customer_id'].to_list()
|
||||
return customer_target_period
|
||||
|
||||
|
||||
def remove_elements(lst, elements_to_remove):
|
||||
return ''.join([x for x in lst if x not in elements_to_remove])
|
||||
|
||||
|
||||
def compute_nb_clients(customer, type_of_activity):
|
||||
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
|
||||
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Number of clients (thousands)")
|
||||
plt.title(f"Number of clients for {type_of_activity}")
|
||||
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("nb_clients_", type_of_activity)
|
||||
|
||||
|
||||
def maximum_price_paid(customer, type_of_activity):
|
||||
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
|
||||
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Maximal price of a ticket Prix")
|
||||
plt.title(f"Maximal price of a ticket for {type_of_activity}")
|
||||
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("Maximal_price_", type_of_activity)
|
||||
|
||||
|
||||
def mailing_consent(customer, type_of_activity):
|
||||
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
|
||||
|
||||
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel('Consent')
|
||||
plt.title(f'Consent of mailing for {type_of_activity}')
|
||||
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("mailing_consent_", type_of_activity)
|
||||
|
||||
|
||||
def mailing_consent_by_target(customer):
|
||||
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
|
||||
# Create the grouped barplot
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
categories = df_graph["number_company"].unique()
|
||||
bar_width = 0.35
|
||||
bar_positions = np.arange(len(categories))
|
||||
|
||||
# Group the data by label and create the grouped bars
|
||||
for label in df_graph["has_purchased_target_period"].unique():
|
||||
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
|
||||
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
|
||||
|
||||
label_printed = "purchased" if label else "no purchase"
|
||||
ax.bar(bar_positions, values, bar_width, label=label_printed)
|
||||
|
||||
# Update the bar positions for the next group
|
||||
bar_positions = [pos + bar_width for pos in bar_positions]
|
||||
|
||||
# Add the labels, the legend, etc.
|
||||
ax.set_xlabel('Company')
|
||||
ax.set_ylabel('Consent')
|
||||
ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
|
||||
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
|
||||
ax.set_xticklabels(categories)
|
||||
ax.legend()
|
||||
|
||||
# Display the plot
|
||||
plt.show()
|
||||
save_file_s3("mailing_consent_target_", type_of_activity)
|
||||
|
||||
|
||||
def gender_bar(customer, type_of_activity):
|
||||
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
|
||||
|
||||
# Create the barplot
|
||||
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
|
||||
plt.bar(company_genders["number_company"], company_genders["gender_female"],
|
||||
bottom = company_genders["gender_male"], label = "Femme")
|
||||
plt.bar(company_genders["number_company"], company_genders["gender_other"],
|
||||
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Gender")
|
||||
plt.title(f"Gender of Customer for {type_of_activity}")
|
||||
plt.legend()
|
||||
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("gender_bar_", type_of_activity)
|
||||
|
||||
|
||||
def country_bar(customer, type_of_activity):
|
||||
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
|
||||
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Share of French Customer")
|
||||
plt.title(f"Share of French Customer for {type_of_activity}")
|
||||
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("country_bar_", type_of_activity)
|
||||
|
||||
|
||||
def lazy_customer_plot(campaigns_kpi, type_of_activity):
|
||||
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
|
||||
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Share of Customers who did not open mail")
|
||||
plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
|
||||
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("lazy_customer_", type_of_activity)
|
||||
|
||||
|
||||
def campaigns_effectiveness(customer, type_of_activity):
|
||||
|
||||
campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()
|
||||
|
||||
plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Number of Customers (thousands)")
|
||||
plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
|
||||
plt.legend()
|
||||
plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("campaigns_effectiveness_", type_of_activity)
|
||||
|
||||
|
||||
def sale_dynamics(products, campaigns_brut, type_of_activity):
|
||||
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
|
||||
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
|
||||
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
|
||||
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
|
||||
|
||||
# Month of the first e-mail received
|
||||
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
|
||||
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
|
||||
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
|
||||
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
|
||||
|
||||
# Merge
|
||||
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
|
||||
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
|
||||
|
||||
# Month from which the customer is considered known
|
||||
|
||||
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
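# A customer is considered known from the earlier of their first purchase month and their first e-mail month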
|
||||
|
||||
# Number of orders per month
|
||||
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
|
||||
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
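# A purchase counts as coming from an already-known customer only if it occurs more than one month after the known_date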
|
||||
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
|
||||
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
|
||||
|
||||
# Number of orders per month by customer type
|
||||
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
|
||||
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
|
||||
|
||||
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
|
||||
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
|
||||
|
||||
# Plot of the number of orders
|
||||
purchases_graph = nb_purchases_graph
|
||||
|
||||
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
|
||||
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
|
||||
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
|
||||
|
||||
|
||||
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
|
||||
|
||||
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
|
||||
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
|
||||
bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")
|
||||
|
||||
|
||||
# Display only abbreviated month-year labels on the x-axis
|
||||
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
|
||||
|
||||
plt.xlabel('Month')
|
||||
plt.ylabel("Number of Sales")
|
||||
plt.title(f"Number of Sales for {type_of_activity}")
|
||||
plt.legend()
|
||||
plt.show()
|
||||
save_file_s3("sale_dynamics_", type_of_activity)
|
||||
|
||||
|
||||
def tickets_internet(tickets, type_of_activity):
|
||||
nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
|
||||
nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]
|
||||
|
||||
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Share of Tickets Bought Online")
|
||||
plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
|
||||
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("tickets_internet_", type_of_activity)
|
||||
|
||||
|
||||
def box_plot_price_tickets(tickets, type_of_activity):
|
||||
price_tickets = tickets[(tickets['total_amount'] > 0)]
|
||||
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
|
||||
plt.title(f"Box plot of price tickets for {type_of_activity}")
|
||||
plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("box_plot_price_tickets_", type_of_activity)
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
Notebook_AR.ipynb (8513 lines): file diff suppressed because one or more lines are too long
README.md (new file, 33 lines)
|
@ -0,0 +1,33 @@
# Business data challenge 2023-2024 | ENSAE Paris
# Arenametrix : customer segmentation

## Team 1 :

* Antoine JOUBREL
* Alexis REVELLE
* Fanta RODRIGUE
* Thomas PIQUÉ

## Coaches :

* Elia LAPENTA
* Michael VISSER

### Description of the problem
The goal of this project is to create segments of customers from 15 companies belonging to 3 different types of activities (sports companies, museums, and music companies).

### Our approach
We opted for a sector-based approach, which means that 3 segmentations have been performed (one for each type of activity).
As the segments have to be linked to a probability of future purchase, we directly used the predicted probability of purchase during the coming year to build the segments. The first step of the modelization is a pipeline that fits 3 ML models (naive Bayes, random forest, and logistic regression) on the data to predict whether the customer will purchase during the year. We then use the estimated probability of purchase to split the customers into 4 segments. For each segment, we can estimate the potential number of tickets and revenue for the coming year.
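As an illustration, the segment-assignment step described above can be sketched as follows. This is a minimal sketch: `assign_segments` and the probability cut-offs are hypothetical placeholders, not the thresholds actually used in 0_6_Segmentation.py or 0_7_CA_segment.py.

```python
import pandas as pd

# Hypothetical cut-offs on the predicted purchase probability
# (the real thresholds are set in the segmentation scripts).
SEGMENT_BINS = [0.0, 0.25, 0.5, 0.75, 1.0]
SEGMENT_LABELS = ["1 - cold", "2 - warm", "3 - hot", "4 - very hot"]

def assign_segments(purchase_probability: pd.Series) -> pd.Series:
    """Map each customer's predicted purchase probability to one of four segments."""
    return pd.cut(purchase_probability, bins=SEGMENT_BINS,
                  labels=SEGMENT_LABELS, include_lowest=True)

# Example: dataset_test['segment'] = assign_segments(dataset_test['Probability_to_buy'])
```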
### How to run the code
- run 0_1_Input_cleaning.py to clean the raw data and generate dataframes that will be used to build datasets with insightful variables.
- run 0_2_Dataset_construction.py.
- run 0_3_General_modelization_dataset.py to generate test and train sets for the 3 types of activities.
- run 0_4_Generate_stat_desc.py to generate graphics describing the data.
- run 0_5_Machine_Learning.py: 3 ML models will be fitted on the data, and results will be exported for all 3 types of activities.
- run 0_6_Segmentation.py: the test set will be scored with the optimal parameters computed previously, which allows us to compute a propensity score (probability of a future purchase). Segmentation is performed according to the scores obtained. This script exports graphics describing the marketing personae associated with the segments as well as their business value.
- run 0_7_CA_segment.py: the scores will be adjusted to better fit the overall probability of a purchase. The adjusted score is used to estimate the number of tickets sold and the revenue generated during the coming year.
|
@ -65,7 +65,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 3,
|
||||
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -115,9 +115,9 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
||||
"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
||||
]
|
||||
}
|
||||
|
@ -228,7 +228,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -238,7 +238,7 @@
|
|||
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -254,7 +254,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"execution_count": 10,
|
||||
"id": "4680f202-979e-483f-89b8-9df877203bcf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -265,7 +265,7 @@
|
|||
" 0.54812838])"
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -282,7 +282,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"execution_count": 11,
|
||||
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -311,7 +311,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 258,
|
||||
"execution_count": 12,
|
||||
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -650,7 +650,7 @@
|
|||
"[354365 rows x 17 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 258,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -662,7 +662,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 259,
|
||||
"execution_count": 13,
|
||||
"id": "648fb542-0186-493d-b274-be2c26a11967",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -677,7 +677,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 260,
|
||||
"execution_count": 14,
|
||||
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -1016,7 +1016,7 @@
|
|||
"[354365 rows x 17 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 260,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -1510,12 +1510,14 @@
|
|||
"\n",
|
||||
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n",
|
||||
"- ajouter un intercept\n",
|
||||
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO "
|
||||
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
|
||||
"\n",
|
||||
"#### A recopier dans la pipeline -> section 2 bis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 266,
|
||||
"execution_count": 15,
|
||||
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -1817,7 +1819,7 @@
|
|||
"[354365 rows x 15 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 266,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -1831,7 +1833,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 267,
|
||||
"execution_count": 16,
|
||||
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -1847,8 +1849,8 @@
|
|||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: Logit Df Residuals: 354350\n",
|
||||
"Method: MLE Df Model: 14\n",
|
||||
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 10:07:29 Log-Likelihood: -83135.\n",
|
||||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 07:57:46 Log-Likelihood: -83135.\n",
|
||||
"converged: True LL-Null: -1.0540e+05\n",
|
||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||||
"=======================================================================================\n",
|
||||
|
@ -1887,7 +1889,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 268,
|
||||
"execution_count": 17,
|
||||
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -1908,7 +1910,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 269,
|
||||
"execution_count": 18,
|
||||
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -2210,7 +2212,7 @@
|
|||
"[354365 rows x 15 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 269,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -2221,7 +2223,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 289,
|
||||
"execution_count": 19,
|
||||
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -2237,8 +2239,8 @@
|
|||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: Logit Df Residuals: 354350\n",
|
||||
"Method: MLE Df Model: 14\n",
|
||||
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 10:26:14 Log-Likelihood: -83135.\n",
|
||||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 07:58:13 Log-Likelihood: -83135.\n",
|
||||
"converged: True LL-Null: -1.0540e+05\n",
|
||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||||
"=======================================================================================\n",
|
||||
|
@ -2276,12 +2278,226 @@
|
|||
"print(result.summary())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Optimization terminated successfully (Exit mode 0)\n",
|
||||
" Current function value: 0.23562928627877766\n",
|
||||
" Iterations: 240\n",
|
||||
" Function evaluations: 243\n",
|
||||
" Gradient evaluations: 240\n",
|
||||
"const 0.000000e+00\n",
|
||||
"nb_tickets 2.477006e-01\n",
|
||||
"nb_purchases 1.636902e-03\n",
|
||||
"total_amount 8.839088e-04\n",
|
||||
"nb_suppliers 1.906550e-65\n",
|
||||
"vente_internet_max 0.000000e+00\n",
|
||||
"purchase_date_min 0.000000e+00\n",
|
||||
"purchase_date_max 0.000000e+00\n",
|
||||
"nb_tickets_internet 7.232680e-112\n",
|
||||
"is_email_true 8.202187e-08\n",
|
||||
"opt_in 0.000000e+00\n",
|
||||
"gender_female 1.624424e-170\n",
|
||||
"gender_male 4.961315e-220\n",
|
||||
"nb_campaigns 6.276733e-205\n",
|
||||
"nb_campaigns_opened 2.228531e-176\n",
|
||||
"dtype: float64\n",
|
||||
" Logit Regression Results \n",
|
||||
"==============================================================================\n",
|
||||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: Logit Df Residuals: 354350\n",
|
||||
"Method: MLE Df Model: 14\n",
|
||||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
|
||||
"Time: 10:45:37 Log-Likelihood: -83152.\n",
|
||||
"converged: True LL-Null: -1.0540e+05\n",
|
||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||||
"=======================================================================================\n",
|
||||
" coef std err z P>|z| [0.025 0.975]\n",
|
||||
"---------------------------------------------------------------------------------------\n",
|
||||
"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
|
||||
"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
|
||||
"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
|
||||
"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
|
||||
"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
|
||||
"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
|
||||
"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
|
||||
"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
|
||||
"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
|
||||
"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
|
||||
"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
|
||||
"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
|
||||
"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
|
||||
"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
|
||||
"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
|
||||
"=======================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
|
||||
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
|
||||
"\n",
|
||||
"# sans spécification, le alpha optimal est déterminé par cross validation\n",
|
||||
"# remplacer alpha=32 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
|
||||
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n",
|
||||
"\n",
|
||||
"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
|
||||
"\n",
|
||||
"print(result.pvalues)\n",
|
||||
"print(result.summary())"
|
||||
]
|
||||
},
|
||||
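As a hedged illustration of the comments above (assuming a fitted scikit-learn GridSearchCV named grid_search whose logistic-regression step is called 'LogisticRegression_cv', as in utils_ml.py), the cross-validated C can be converted into the alpha expected by statsmodels:

# hypothetical sketch: recover the optimal C and convert it to the statsmodels penalty strength
best_C = grid_search.best_params_['LogisticRegression_cv__C']
best_alpha = 1.0 / best_C
result = model_logit.fit_regularized(method='l1', alpha=best_alpha)
print(result.pvalues)
print(result.summary())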
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Other method : take into account the weigths ! Pb : with this method, no penalty allowed"
|
||||
]
|
||||
},
|
||||
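The weight_dict used by the cells below is defined earlier in the notebook; as a reminder, a minimal reconstruction sketch (an assumption, not shown in this diff) using scikit-learn's balanced class weights:

from sklearn.utils import class_weight
import numpy as np

# hypothetical reconstruction: one weight per class, inversely proportional to its frequency in y
classes = np.unique(y)
weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y)
weight_dict = {c: w for c, w in zip(classes, weights)}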
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 247,
|
||||
"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define a function to generate summaries of logit model\n",
|
||||
"\n",
|
||||
"def model_logit(X, y, weight_dict, add_constant=False) :\n",
|
||||
" # Generate sample weights based on class weights computed earlier\n",
|
||||
" sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
|
||||
"\n",
|
||||
" if add_constant :\n",
|
||||
" X_const = sm.add_constant(X)\n",
|
||||
" else :\n",
|
||||
" X_const = X\n",
|
||||
" \n",
|
||||
" # Use GLM from statsmodels with Binomial family for logistic regression\n",
|
||||
" model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
|
||||
" \n",
|
||||
" # fit without penalty\n",
|
||||
" result = model.fit()\n",
|
||||
"\n",
|
||||
" result_summary = result.summary()\n",
|
||||
" \n",
|
||||
" return result_summary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 248,
|
||||
"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Generalized Linear Model Regression Results \n",
|
||||
"==============================================================================\n",
|
||||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: GLM Df Residuals: 354350\n",
|
||||
"Model Family: Binomial Df Model: 14\n",
|
||||
"Link Function: Logit Scale: 1.0000\n",
|
||||
"Method: IRLS Log-Likelihood: -1.8693e+05\n",
|
||||
"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
|
||||
"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
|
||||
"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
|
||||
"Covariance Type: nonrobust \n",
|
||||
"=======================================================================================\n",
|
||||
" coef std err z P>|z| [0.025 0.975]\n",
|
||||
"---------------------------------------------------------------------------------------\n",
|
||||
"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
|
||||
"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
|
||||
"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
|
||||
"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
|
||||
"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
|
||||
"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
|
||||
"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
|
||||
"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
|
||||
"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
|
||||
"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
|
||||
"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
|
||||
"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
|
||||
"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
|
||||
"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
|
||||
"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
|
||||
"=======================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# with the function\n",
|
||||
"\n",
|
||||
"# 1. logit with weights\n",
|
||||
"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
|
||||
"print(results_logit_weight)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 252,
|
||||
"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Generalized Linear Model Regression Results \n",
|
||||
"==============================================================================\n",
|
||||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: GLM Df Residuals: 354350\n",
|
||||
"Model Family: Binomial Df Model: 14\n",
|
||||
"Link Function: Logit Scale: 1.0000\n",
|
||||
"Method: IRLS Log-Likelihood: -83141.\n",
|
||||
"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
|
||||
"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
|
||||
"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
|
||||
"Covariance Type: nonrobust \n",
|
||||
"=======================================================================================\n",
|
||||
" coef std err z P>|z| [0.025 0.975]\n",
|
||||
"---------------------------------------------------------------------------------------\n",
|
||||
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
|
||||
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
|
||||
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
|
||||
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
|
||||
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
|
||||
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
|
||||
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
|
||||
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
|
||||
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
|
||||
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
|
||||
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
|
||||
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
|
||||
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
|
||||
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
|
||||
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
|
||||
"=======================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 2. logit without weights\n",
|
||||
"\n",
|
||||
"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
|
||||
"print(results_logit)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
|
||||
"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
410
utils_ml.py
Normal file
|
@ -0,0 +1,410 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import s3fs
|
||||
import re
|
||||
import io
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
|
||||
from sklearn.utils import class_weight
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.calibration import calibration_curve
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
|
||||
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
|
||||
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
|
||||
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
|
||||
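# NOTE: the helpers below rely on names expected to be provided by the calling pipeline script:
# the s3fs filesystem `fs`, plus type_of_activity, type_of_model, preproc, weight_dict and dataset_test.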
def load_train_test(type_of_activity, type_of_model):
|
||||
BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}"
|
||||
File_path_train = BUCKET + "/Train_set.csv"
|
||||
File_path_test = BUCKET + "/Test_set.csv"
|
||||
|
||||
with fs.open( File_path_train, mode="rb") as file_in:
|
||||
dataset_train = pd.read_csv(file_in, sep=",")
|
||||
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
|
||||
|
||||
with fs.open(File_path_test, mode="rb") as file_in:
|
||||
dataset_test = pd.read_csv(file_in, sep=",")
|
||||
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
|
||||
|
||||
if type_of_model=='premium':
|
||||
dataset_train['company'] = dataset_train['customer_id'].apply(lambda x: x.split('_')[0])
|
||||
dataset_test['company'] = dataset_test['customer_id'].apply(lambda x: x.split('_')[0])
|
||||
dataset_train = dataset_train[dataset_train['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
|
||||
dataset_test = dataset_test[dataset_test['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
|
||||
return dataset_train, dataset_test
|
||||
|
||||
|
||||
def save_file_s3(File_name, type_of_activity, type_of_model, model):
|
||||
image_buffer = io.BytesIO()
|
||||
plt.savefig(image_buffer, format='png')
|
||||
image_buffer.seek(0)
|
||||
FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
|
||||
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
|
||||
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
|
||||
s3_file.write(image_buffer.read())
|
||||
plt.close()
|
||||
|
||||
|
||||
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
|
||||
if model_path:
|
||||
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
|
||||
else:
|
||||
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
|
||||
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
|
||||
result_set.to_csv(file_out, index = False)
|
||||
|
||||
|
||||
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
|
||||
model_bytes = pickle.dumps(classifier)
|
||||
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
|
||||
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
|
||||
f.write(model_bytes)
|
||||
|
||||
|
||||
def compute_recall(group):
|
||||
return recall_score(group['y_has_purchased'], group['prediction'])
|
||||
|
||||
|
||||
def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
|
||||
test = dataset_test.copy()
|
||||
test['prediction'] = y_pred
|
||||
test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
|
||||
recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
|
||||
save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
|
||||
|
||||
|
||||
def features_target_split(dataset_train, dataset_test):
|
||||
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
|
||||
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
|
||||
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
|
||||
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
|
||||
'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
|
||||
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
|
||||
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
|
||||
'target_jeune', 'target_abonne']
|
||||
X_train = dataset_train[features_l]
|
||||
y_train = dataset_train[['y_has_purchased']]
|
||||
|
||||
X_test = dataset_test[features_l]
|
||||
y_test = dataset_test[['y_has_purchased']]
|
||||
return X_train, X_test, y_train, y_test
|
||||
|
||||
|
||||
def preprocess(type_of_model, type_of_activity):
|
||||
|
||||
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
|
||||
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
|
||||
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
|
||||
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']
|
||||
|
||||
binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
|
||||
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
|
||||
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']
|
||||
|
||||
if type_of_activity=='musee':
|
||||
numeric_features.remove('time_to_open')
|
||||
|
||||
if type_of_model=='premium':
|
||||
if type_of_activity=='musique':
|
||||
binary_features.extend(['target_optin', 'target_newsletter'])
|
||||
elif type_of_activity=='sport':
|
||||
binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
|
||||
else:
|
||||
binary_features.extend([ 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])
|
||||
|
||||
|
||||
numeric_transformer = Pipeline(steps=[
|
||||
("imputer", SimpleImputer(strategy="constant", fill_value=0)),
|
||||
("scaler", StandardScaler())
|
||||
])
|
||||
|
||||
binary_transformer = Pipeline(steps=[
|
||||
("imputer", SimpleImputer(strategy="most_frequent")),
|
||||
])
|
||||
preproc = ColumnTransformer(
|
||||
transformers=[
|
||||
("num", numeric_transformer, numeric_features),
|
||||
("bin", binary_transformer, binary_features)
|
||||
]
|
||||
)
|
||||
return preproc
|
||||
|
||||
|
||||
def draw_confusion_matrix(y_test, y_pred, model):
|
||||
conf_matrix = confusion_matrix(y_test, y_pred)
|
||||
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
|
||||
plt.xlabel('Predicted')
|
||||
plt.ylabel('Actual')
|
||||
plt.title('Confusion Matrix')
|
||||
plt.show()
|
||||
save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
|
||||
|
||||
|
||||
def draw_roc_curve(X_test, y_pred_prob, model):
|
||||
# Compute the false positive rate (FPR) and true positive rate (TPR); y_test is read from the calling scope
|
||||
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
|
||||
|
||||
# Compute the area under the ROC curve (AUC)
|
||||
roc_auc = auc(fpr, tpr)
|
||||
|
||||
plt.figure(figsize = (14, 8))
|
||||
plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
|
||||
plt.plot([0, 1], [0, 1], color="red",label="Random Baseline", linestyle="--")
|
||||
plt.grid(color='gray', linestyle='--', linewidth=0.5)
|
||||
plt.xlabel("False Positive Rate")
|
||||
plt.ylabel("True Positive Rate")
|
||||
plt.title("ROC Curve", size=18)
|
||||
plt.legend(loc="lower right")
|
||||
plt.show()
|
||||
save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
|
||||
|
||||
|
||||
def draw_calibration_curve(X_test, y_pred_prob, model):
|
||||
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
|
||||
|
||||
# Plot the calibration curve
|
||||
plt.plot(mean_pred, frac_pos, 's-', label=model)
|
||||
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
|
||||
plt.xlabel('Mean predicted value')
|
||||
plt.ylabel('Fraction of positive predictions')
|
||||
plt.title("Calibration Curve")
|
||||
plt.legend()
|
||||
plt.show()
|
||||
save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
|
||||
|
||||
|
||||
def draw_features_importance(pipeline, model, randomF = False):
|
||||
if randomF:
|
||||
coefficients = pipeline.named_steps[model].feature_importances_
|
||||
else:
|
||||
coefficients = pipeline.named_steps[model].coef_[0]
|
||||
|
||||
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
|
||||
# Plot feature importance
|
||||
plt.figure(figsize=(12, 8))
|
||||
plt.barh(feature_names, coefficients, color='skyblue')
|
||||
plt.xlabel("Features' Importance")
|
||||
plt.ylabel('Features')
|
||||
plt.title("Features' Importance")
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
save_file_s3("Features_", type_of_activity, type_of_model, model)
|
||||
|
||||
|
||||
def draw_prob_distribution(y_pred_prob, model):
|
||||
plt.figure(figsize=(10, 8))
|
||||
plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
|
||||
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, None)
|
||||
|
||||
plt.title('Histogram of predicted probabilities for class 1')
|
||||
plt.xlabel('Probability')
|
||||
plt.ylabel('Frequency')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
|
||||
|
||||
|
||||
def draw_prob_distribution_companies(y_pred_prob, model):
|
||||
test = dataset_test.copy()
|
||||
test['probability to buy'] = y_pred_prob
|
||||
test['company'] = test['customer_id'].str.split('_', expand=True)[0]
|
||||
sns.histplot(data=test, x='probability to buy', hue='company', element='step',
|
||||
stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
|
||||
plt.xlim(0, 1)
|
||||
plt.ylim(0, None)
|
||||
plt.title('Histogram of probabilities for class 1 by company')
|
||||
plt.xlabel('Probability')
|
||||
plt.ylabel('Frequency')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
|
||||
|
||||
|
||||
|
||||
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
|
||||
pipeline = Pipeline(steps=[
|
||||
('preprocessor', preproc),
|
||||
('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
|
||||
max_iter=5000, n_jobs=-1))
|
||||
])
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
y_pred = pipeline.predict(X_test)
|
||||
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
|
||||
|
||||
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
|
||||
model = "LogisticRegression_Benchmark"
|
||||
result = pd.DataFrame({"Model" : [model],
|
||||
"Accuracy" : [accuracy_score(y_test, y_pred)],
|
||||
"Recall" : [recall_score(y_test, y_pred)],
|
||||
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
|
||||
"AUC" : [auc(fpr, tpr)]}
|
||||
)
|
||||
model_result = pd.concat([model_result, result])
|
||||
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
|
||||
|
||||
draw_confusion_matrix(y_test, y_pred, model)
|
||||
draw_roc_curve(X_test, y_pred_prob, model)
|
||||
draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
|
||||
draw_prob_distribution(y_pred_prob, model)
|
||||
draw_prob_distribution_companies(y_pred_prob, model)
|
||||
draw_calibration_curve(X_test, y_pred_prob, model)
|
||||
save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
|
||||
return model_result
|
||||
|
||||
|
||||
def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
|
||||
y_train = y_train['y_has_purchased']
|
||||
param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
|
||||
'LogisticRegression_cv__penalty': ['l1', 'l2'],
|
||||
'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
|
||||
pipeline = Pipeline(steps=[
|
||||
('preprocessor', preproc),
|
||||
('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
|
||||
])
|
||||
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
|
||||
n_jobs=-1)
|
||||
|
||||
grid_search.fit(X_train, y_train)
|
||||
y_pred = grid_search.predict(X_test)
|
||||
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
|
||||
best_pipeline = grid_search.best_estimator_
|
||||
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
|
||||
model = "LogisticRegression_cv"
|
||||
result = pd.DataFrame({"Model" : [model],
|
||||
"Accuracy" : [accuracy_score(y_test, y_pred)],
|
||||
"Recall" : [recall_score(y_test, y_pred)],
|
||||
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
|
||||
"AUC" : [auc(fpr, tpr)]}
|
||||
)
|
||||
model_result = pd.concat([model_result, result])
|
||||
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
|
||||
|
||||
draw_confusion_matrix(y_test, y_pred, model)
|
||||
draw_roc_curve(X_test, y_pred_prob, model)
|
||||
draw_features_importance(best_pipeline, 'LogisticRegression_cv')
|
||||
draw_prob_distribution(y_pred_prob, model)
|
||||
draw_prob_distribution_companies(y_pred_prob, model)
|
||||
draw_calibration_curve(X_test, y_pred_prob, model)
|
||||
save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
|
||||
return model_result
|
||||
|
||||
|
||||
def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
|
||||
pipeline = Pipeline(steps=[
|
||||
('preprocessor', preproc),
|
||||
('randomF', RandomForestClassifier(class_weight = weight_dict,
|
||||
n_jobs=-1))
|
||||
])
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
y_pred = pipeline.predict(X_test)
|
||||
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
|
||||
|
||||
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
|
||||
model = "randomF"
|
||||
result = pd.DataFrame({"Model" : [model],
|
||||
"Accuracy" : [accuracy_score(y_test, y_pred)],
|
||||
"Recall" : [recall_score(y_test, y_pred)],
|
||||
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
|
||||
"AUC" : [auc(fpr, tpr)]}
|
||||
)
|
||||
model_result = pd.concat([model_result, result])
|
||||
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
|
||||
|
||||
draw_confusion_matrix(y_test, y_pred, model)
|
||||
draw_roc_curve(X_test, y_pred_prob, model)
|
||||
draw_features_importance(pipeline, 'randomF', randomF=True)
|
||||
draw_prob_distribution(y_pred_prob, model)
|
||||
draw_prob_distribution_companies(y_pred_prob, model)
|
||||
draw_calibration_curve(X_test, y_pred_prob, model)
|
||||
save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
|
||||
return model_result
|
||||
|
||||
|
||||
def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
|
||||
y_train = y_train['y_has_purchased']
|
||||
param_grid = {
|
||||
'randomF_cv__n_estimators': [100, 300],
|
||||
'randomF_cv__max_features': ['sqrt', 'log2'],
|
||||
'randomF_cv__min_samples_split': [2, 10],
|
||||
'randomF_cv__min_samples_leaf': [1, 4],
|
||||
'randomF_cv__class_weight': [weight_dict]
|
||||
}
|
||||
pipeline = Pipeline(steps=[
|
||||
('preprocessor', preproc),
|
||||
('randomF_cv', RandomForestClassifier(n_jobs=-1))
|
||||
])
|
||||
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
|
||||
n_jobs=-1)
|
||||
|
||||
grid_search.fit(X_train, y_train)
|
||||
y_pred = grid_search.predict(X_test)
|
||||
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
|
||||
best_pipeline = grid_search.best_estimator_
|
||||
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
|
||||
model = "randomF_cv"
|
||||
result = pd.DataFrame({"Model" : [model],
|
||||
"Accuracy" : [accuracy_score(y_test, y_pred)],
|
||||
"Recall" : [recall_score(y_test, y_pred)],
|
||||
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
|
||||
"AUC" : [auc(fpr, tpr)]}
|
||||
)
|
||||
model_result = pd.concat([model_result, result])
|
||||
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
|
||||
|
||||
draw_confusion_matrix(y_test, y_pred, model)
|
||||
draw_roc_curve(X_test, y_pred_prob, model)
|
||||
draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
|
||||
draw_prob_distribution(y_pred_prob, model)
|
||||
draw_prob_distribution_companies(y_pred_prob, model)
|
||||
draw_calibration_curve(X_test, y_pred_prob, model)
|
||||
save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
|
||||
return model_result
|
||||
|
||||
|
||||
def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
|
||||
unique_classes, counts = np.unique(y_train, return_counts=True)
|
||||
class_priors = counts / counts.sum()
|
||||
pipeline = Pipeline(steps=[
|
||||
('preprocessor', preproc),
|
||||
('Naive_Bayes', GaussianNB(priors=class_priors))
|
||||
])
|
||||
pipeline.fit(X_train, y_train)
|
||||
|
||||
y_pred = pipeline.predict(X_test)
|
||||
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
|
||||
|
||||
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
|
||||
model = "Naive_Bayes"
|
||||
result = pd.DataFrame({"Model" : [model],
|
||||
"Accuracy" : [accuracy_score(y_test, y_pred)],
|
||||
"Recall" : [recall_score(y_test, y_pred)],
|
||||
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
|
||||
"AUC" : [auc(fpr, tpr)]}
|
||||
)
|
||||
model_result = pd.concat([model_result, result])
|
||||
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
|
||||
|
||||
draw_confusion_matrix(y_test, y_pred, model)
|
||||
draw_roc_curve(X_test, y_pred_prob, model)
|
||||
draw_prob_distribution(y_pred_prob, model)
|
||||
draw_calibration_curve(X_test, y_pred_prob, model)
|
||||
save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
|
||||
return model_result
|
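A minimal driver sketch for the module above (an assumption about how the pipeline script is meant to call it, not part of this commit; the activity and model values are illustrative):

import os
import numpy as np
import pandas as pd
import s3fs
from sklearn.utils import class_weight

# the helpers in utils_ml.py expect `fs` and the run-level globals below to exist in the calling scope
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

exec(open('utils_ml.py').read())

type_of_activity = 'sport'      # assumed value
type_of_model = 'standard'      # assumed value

dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
preproc = preprocess(type_of_model, type_of_activity)

# balanced class weights, shared by the benchmark pipelines
weights = class_weight.compute_class_weight('balanced',
                                            classes=np.unique(y_train['y_has_purchased']),
                                            y=y_train['y_has_purchased'])
weight_dict = {0: weights[0], 1: weights[1]}

model_result = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'F1_score', 'AUC'])
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)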
27
utils_segmentation.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import io
|
||||
import s3fs
|
||||
import re
|
||||
import pickle
|
||||
import warnings
|
||||
|
||||
|
||||
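# NOTE: the s3fs filesystem `fs` used below is expected to be defined by the calling script.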
def load_model(type_of_activity, model):
|
||||
BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
|
||||
filename = model + '.pkl'
|
||||
file_path = BUCKET + filename
|
||||
with fs.open(file_path, mode="rb") as f:
|
||||
model_bytes = f.read()
|
||||
|
||||
model = pickle.loads(model_bytes)
|
||||
return model
|
||||
|
||||
|
||||
def load_test_file(type_of_activity):
|
||||
file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
|
||||
with fs.open(file_path_test, mode="rb") as file_in:
|
||||
dataset_test = pd.read_csv(file_in, sep=",")
|
||||
return dataset_test
|
||||
|
438
utils_stat_desc.py
Normal file
|
@ -0,0 +1,438 @@
|
|||
import pandas as pd
|
||||
import os
|
||||
import s3fs
|
||||
import io
|
||||
import warnings
|
||||
from datetime import date, timedelta, datetime
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates as mdates
|
||||
import seaborn as sns
|
||||
|
||||
|
||||
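# NOTE: the helpers below rely on names provided by the calling script: the s3fs filesystem `fs`,
# type_of_activity, and the KPI functions from 0_KPI_functions.py (display_input_databases,
# campaigns_kpi_function, tickets_kpi_function, customerplus_kpi_function, targets_KPI).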
def load_files(nb_compagnie):
|
||||
customer = pd.DataFrame()
|
||||
campaigns_brut = pd.DataFrame()
|
||||
campaigns_kpi = pd.DataFrame()
|
||||
products = pd.DataFrame()
|
||||
tickets = pd.DataFrame()
|
||||
targets = pd.DataFrame()
|
||||
|
||||
# start of the loop that builds aggregated datasets for the 5 performing-arts companies
|
||||
for directory_path in nb_compagnie:
|
||||
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
|
||||
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
|
||||
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
|
||||
df_target_information = display_input_databases(directory_path, file_name = "target_information")
|
||||
|
||||
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
|
||||
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
|
||||
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
|
||||
df_target_KPI = targets_KPI(df_target = df_target_information)
|
||||
|
||||
# Merge the target KPIs with the full customer base and fill missing values with 0
|
||||
df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
|
||||
targets_columns = list(df_target_KPI.columns)
|
||||
targets_columns.remove('customer_id')
|
||||
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
|
||||
|
||||
# create the number_company column, which will be used to aggregate the results
|
||||
df_tickets_kpi["number_company"]=int(directory_path)
|
||||
df_campaigns_brut["number_company"]=int(directory_path)
|
||||
df_campaigns_kpi["number_company"]=int(directory_path)
|
||||
df_customerplus_clean["number_company"]=int(directory_path)
|
||||
df_target_information["number_company"]=int(directory_path)
|
||||
df_target_KPI["number_company"]=int(directory_path)
|
||||
|
||||
# Index handling: prefix customer_id with the company number
|
||||
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
|
||||
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
|
||||
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
|
||||
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
|
||||
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
|
||||
# Remove companies' outliers
|
||||
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
|
||||
# harmonize the set of customers across databases (filter each dataframe explicitly so the filtering persists)
|
||||
customer_id = df_tickets_kpi['customer_id'].to_list()
|
||||
df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
|
||||
df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
|
||||
df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
|
||||
df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
|
||||
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
|
||||
# Concatenation
|
||||
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
|
||||
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
|
||||
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
|
||||
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
|
||||
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
|
||||
targets = pd.concat([targets, df_target_KPI], ignore_index=True)
|
||||
|
||||
return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
|
||||
|
||||
|
||||
def remove_outlier_total_amount(tickets):
|
||||
Q1 = tickets['total_amount'].quantile(0.25)
|
||||
Q3 = tickets['total_amount'].quantile(0.75)
|
||||
IQR = Q3 - Q1
|
||||
upper = Q3 +1.5*IQR
|
||||
outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
|
||||
tickets = tickets[~tickets['customer_id'].isin(outliers)]
|
||||
return tickets
|
||||
|
||||
|
||||
def save_file_s3(File_name, type_of_activity):
|
||||
image_buffer = io.BytesIO()
|
||||
plt.savefig(image_buffer, format='png')
|
||||
image_buffer.seek(0)
|
||||
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
|
||||
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
|
||||
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
|
||||
s3_file.write(image_buffer.read())
|
||||
plt.close()
|
||||
|
||||
|
||||
def outlier_detection(tickets, company_list, show_diagram=False):
|
||||
|
||||
outlier_list = list()
|
||||
|
||||
for company in company_list:
|
||||
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
|
||||
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
|
||||
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
|
||||
|
||||
total_amount_share_index = total_amount_share.set_index('customer_id')
|
||||
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
|
||||
#print('df circulaire : ', df_circulaire.head())
|
||||
top = df_circulaire[:1]
|
||||
#print('top : ', top)
|
||||
outlier_list.append(top.index[0])
|
||||
rest = df_circulaire[1:]
|
||||
|
||||
rest_sum = rest.sum()
|
||||
|
||||
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
|
||||
|
||||
if show_diagram:
|
||||
plt.figure(figsize=(3, 3))
|
||||
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
|
||||
plt.axis('equal')
|
||||
plt.title(f'Breakdown of total amounts for company {company}')
|
||||
plt.show()
|
||||
return outlier_list
|
||||
|
||||
|
||||
def valid_customer_detection(products, campaigns_brut):
|
||||
products_valid = products[products['purchase_date']>="2021-05-01"]
|
||||
consumer_valid_product = products_valid['customer_id'].to_list()
|
||||
|
||||
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
|
||||
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
|
||||
|
||||
consumer_valid = consumer_valid_product + consumer_valid_campaigns
|
||||
return consumer_valid
|
||||
|
||||
|
||||
def identify_purchase_during_target_periode(products):
|
||||
products_target_period = products[(products['purchase_date']>="2022-11-01")
|
||||
& (products['purchase_date']<="2023-11-01")]
|
||||
customer_target_period = products_target_period['customer_id'].to_list()
|
||||
return customer_target_period
|
||||
|
||||
|
||||
def remove_elements(lst, elements_to_remove):
|
||||
return ''.join([x for x in lst if x not in elements_to_remove])
|
||||
|
||||
|
||||
def compute_nb_clients(customer, type_of_activity):
|
||||
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
|
||||
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
|
||||
|
||||
plt.xlabel('Company')
|
||||
plt.ylabel("Number of clients (thousands)")
|
||||
plt.title(f"Number of clients Across {type_of_activity} Companies")
|
||||
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("nb_clients_", type_of_activity)
|
||||
|
||||
|
||||
def maximum_price_paid(customer, type_of_activity):
|
||||
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
|
||||
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel("Maximal price of a ticket Prix")
|
||||
plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
|
||||
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("Maximal_price_", type_of_activity)
|
||||
|
||||
|
||||
def target_proportion(customer, type_of_activity):
|
||||
df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
|
||||
'customer_id' : 'nunique'}).reset_index()
|
||||
df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
|
||||
plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel('Share (%)')
|
||||
plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
|
||||
plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("share_target_", type_of_activity)
|
||||
|
||||
|
||||
def mailing_consent(customer, type_of_activity):
|
||||
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
|
||||
mailing_consent["opt_in"] *= 100
|
||||
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel('Mailing Consent (%)')
|
||||
plt.title(f'Mailing Consent Across {type_of_activity} Companies')
|
||||
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("mailing_consent_", type_of_activity)
|
||||
|
||||
|
||||
def mailing_consent_by_target(customer):
|
||||
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
|
||||
# Create the grouped bar plot
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
categories = df_graph["number_company"].unique()
|
||||
bar_width = 0.35
|
||||
bar_positions = np.arange(len(categories))
|
||||
|
||||
# Group the data by label and create the grouped bars
|
||||
for label in df_graph["has_purchased_target_period"].unique():
|
||||
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
|
||||
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
|
||||
|
||||
label_printed = "Purchase" if label else "No purchase"
|
||||
ax.bar(bar_positions, values, bar_width, label=label_printed)
|
||||
|
||||
# Update the bar positions for the next group
|
||||
bar_positions = [pos + bar_width for pos in bar_positions]
|
||||
|
||||
# Add labels, legend, etc.
|
||||
ax.set_xlabel('Company Number')
|
||||
ax.set_ylabel('Mailing Consent (%)')
|
||||
ax.set_title(f'Mailing Consent by Target Across {type_of_activity} Companies')
|
||||
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
|
||||
ax.set_xticklabels(categories)
|
||||
ax.legend()
|
||||
|
||||
# Display the plot
|
||||
plt.show()
|
||||
save_file_s3("mailing_consent_target_", type_of_activity)
|
||||
|
||||
|
||||
def gender_bar(customer, type_of_activity):
|
||||
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
|
||||
|
||||
company_genders["gender_male"] *= 100
|
||||
company_genders["gender_female"] *= 100
|
||||
company_genders["gender_other"] *= 100
|
||||
|
||||
# Create the bar plot
|
||||
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
|
||||
plt.bar(company_genders["number_company"], company_genders["gender_female"],
|
||||
bottom = company_genders["gender_male"], label = "Female")
|
||||
plt.bar(company_genders["number_company"], company_genders["gender_other"],
|
||||
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel("Frequency (%)")
|
||||
plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
|
||||
plt.legend()
|
||||
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("gender_bar_", type_of_activity)
|
||||
|
||||
|
||||
def country_bar(customer, type_of_activity):
|
||||
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
|
||||
company_country_fr["country_fr"] *= 100
|
||||
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel("Share of French Customer (%)")
|
||||
plt.title(f"Share of French Customer Across {type_of_activity} Companies")
|
||||
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("country_bar_", type_of_activity)
|
||||
|
||||
|
||||
def lazy_customer_plot(campaigns_kpi, type_of_activity):
|
||||
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
|
||||
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies")
|
||||
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("lazy_customer_", type_of_activity)
|
||||
|
||||
|
||||
def campaigns_effectiveness(customer, type_of_activity):
|
||||
|
||||
campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
|
||||
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
categories = campaigns_effectiveness["number_company"].unique()
|
||||
bar_width = 0.35
|
||||
bar_positions = np.arange(len(categories))
|
||||
|
||||
# Group the data by label and create the grouped bars
|
||||
for label in campaigns_effectiveness["has_purchased_target_period"].unique():
|
||||
label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
|
||||
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
|
||||
|
||||
label_printed = "Purchase" if label else "No purchase"
|
||||
ax.bar(bar_positions, values, bar_width, label=label_printed)
|
||||
|
||||
# Update the bar positions for the next group
|
||||
bar_positions = [pos + bar_width for pos in bar_positions]
|
||||
|
||||
# Add labels, legend, etc.
|
||||
ax.set_xlabel('Company Number')
|
||||
ax.set_ylabel('Share of Consent (%)')
|
||||
ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
|
||||
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
|
||||
ax.set_xticklabels(categories)
|
||||
ax.legend()
|
||||
plt.show()
|
||||
save_file_s3("campaigns_effectiveness_", type_of_activity)
|
||||
|
||||
|
||||
def sale_dynamics(products, campaigns_brut, type_of_activity):
|
||||
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
|
||||
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
|
||||
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
|
||||
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
|
||||
|
||||
# Month of the first email received
|
||||
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
|
||||
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
|
||||
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
|
||||
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
|
||||
|
||||
# Merge
|
||||
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
|
||||
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
|
||||
|
||||
# Month from which the customer is considered known
|
||||
|
||||
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
|
||||
|
||||
# Number of orders per month
|
||||
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
|
||||
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
|
||||
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
|
||||
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
|
||||
|
||||
# Number of orders per month by customer type
|
||||
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
|
||||
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
|
||||
|
||||
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
|
||||
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
|
||||
|
||||
# Plot of the number of orders
|
||||
purchases_graph = nb_purchases_graph
|
||||
|
||||
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
|
||||
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
|
||||
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
|
||||
|
||||
|
||||
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
|
||||
|
||||
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
|
||||
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
|
||||
bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
|
||||
|
||||
|
||||
# format the x-axis to show month/year only
|
||||
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
|
||||
|
||||
plt.xlabel('Month')
|
||||
plt.ylabel("Number of Sales")
|
||||
plt.title(f"Number of Sales Across {type_of_activity} Companies")
|
||||
plt.legend()
|
||||
plt.show()
|
||||
save_file_s3("sale_dynamics_", type_of_activity)
|
||||
|
||||
|
||||
def tickets_internet(tickets, type_of_activity):
|
||||
nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
|
||||
nb_tickets_internet['prop_purchases_internet'] *=100
|
||||
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel("Share of Purchases Bought Online (%)")
|
||||
plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
|
||||
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("tickets_internet_", type_of_activity)
|
||||
|
||||
|
||||
def already_bought_online(tickets, type_of_activity):
|
||||
nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
|
||||
'customer_id' : 'nunique'}
|
||||
).reset_index())
|
||||
nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
|
||||
|
||||
plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
|
||||
|
||||
plt.xlabel('Company Number')
|
||||
plt.ylabel("Share of Customer who Bought Online at least once (%)")
|
||||
plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
|
||||
plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
|
||||
plt.show()
|
||||
save_file_s3("First_buy_internet_", type_of_activity)
|
||||
|
||||
|
||||
def box_plot_price_tickets(tickets, type_of_activity):
|
||||
price_tickets = tickets[(tickets['total_amount'] > 0)]
|
||||
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
|
||||
plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
|
||||
plt.show()
|
||||
save_file_s3("box_plot_price_tickets_", type_of_activity)
|
||||
|
||||
def target_description(targets, type_of_activity):
|
||||
|
||||
describe_target = targets.groupby('number_company').agg(
|
||||
prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
|
||||
prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))
|
||||
|
||||
plot = describe_target.plot.bar()
|
||||
|
||||
# Adding a title
|
||||
plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")
|
||||
|
||||
# Adding labels for x and y axes
|
||||
plot.set_xlabel("Company Number")
|
||||
plot.set_ylabel("Target Proportion")
|
||||
|
||||
plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')
|
||||
|
||||
|
||||
# Adding a legend
|
||||
plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")
|
||||
|
||||
save_file_s3("target_category_proportion_", type_of_activity)
|
||||
|
||||
|
||||
|