Merge branch 'main' into segment_value

This commit is contained in:
Antoine JOUBREL 2024-03-28 20:44:06 +00:00
commit 3d6414728c
19 changed files with 2908 additions and 21144 deletions

View File

@ -1,5 +1,8 @@
# Business Data Challenge - Team 1
# Purpose of the script : Construction of training and test datasets for modelling by company
# Input : KPI construction function and clean databases in the 0_Input folder
# Output : Train and test datasets by companies
# Packages
import pandas as pd
import numpy as np
import os
@ -9,12 +12,10 @@ import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('0_KPI_functions.py').read())
@ -24,53 +25,69 @@ warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import of cleaned and merged datasets
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
# if directory_path == "101":
# df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
# df_products_purchased_reduced = pd.concat([df_products_purchased_reduced, df_products_purchased_reduced_1])
# Consistency filter for the application of our method
# Dates in datetime format
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter for database df_campaigns_information
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter for database df_products_purchased_reduced
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Building and merging features
# Campaigns features
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
# Purchasing behavior features
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
# Socio-demographic features
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
# Targets features
df_targets_kpi = targets_KPI(df_target = df_target_information)
print("KPIs construction : SUCCESS")
# Merge - campaigns features
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
# Merge - targets features
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
targets_columns = list(df_targets_kpi.columns)
targets_columns.remove('customer_id')
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
# We standardize the number of targets, a feature closely linked to the company's operations
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
# Merge - purchasing behavior features
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
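# Editor's note: customers with no purchase have purchase_date_min/max missing; they are
# filled below with max_interval, the full length of the feature window in days, so that
# they read as the least recent possible buyers after the later date-to-days conversion.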
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
@ -82,9 +99,9 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
# Construction of the dependent variable
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
@ -103,28 +120,24 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
return dataset
## Export
# Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
# Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]
# Create test and train datasets
# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)
# Export folder
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}'
# Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"
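# Editor's sketch (not in the original script): the three dates define an 18-month feature
# window followed by a 12-month prediction window.
_d = pd.to_datetime([start_date, end_of_features, final_date], utc = True, format = 'ISO8601')
assert (_d[1] - _d[0]).days == 549 and (_d[2] - _d[1]).days == 365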
# Anonymous customer to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
'5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
'10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
@ -133,33 +146,23 @@ for company in list_of_comp:
dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
max_date = final_date, directory_path = company)
# Deletion of the anonymous customer
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
# Split between train and test
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
# Dataset Test
# Export
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
print("Export of dataset test : SUCCESS")
# Dataset train
# Export
FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
@ -167,7 +170,7 @@ for company in list_of_comp:
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
print("Exportation dataset train : SUCCESS")
print("Export of dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
print("End of dataset generation for ", type_of_comp," compagnies : SUCCESS")

View File

@ -21,7 +21,7 @@ warnings.filterwarnings('ignore')
# functions
def generate_test_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set")
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Test_set")
test_set = pd.DataFrame()
for file in file_path_list:
print(file)
@ -32,7 +32,7 @@ def generate_test_set(type_of_comp):
def generate_train_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set")
file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization_v2/{type_of_comp}/Train_set")
train_set = pd.DataFrame()
for file in file_path_list:
print(file)
@ -43,7 +43,7 @@ def generate_train_set(type_of_comp):
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
BUCKET_OUT = f'projet-bdc2324-team1/Generalization_v2/{type_of_comp}/'
# create test and train datasets
test_set = generate_test_set(type_of_comp)

0_4_Generate_stat_desc.py (new file, 74 additions)
View File

@ -0,0 +1,74 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings
# Ignore warning
warnings.filterwarnings('ignore')
exec(open('0_KPI_functions.py').read())
exec(open('utils_stat_desc.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]
# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(list_of_comp)
# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customers (customers who bought tickets or received emails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list)) # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customers
    #print(f'shape of {dataset} : ', dataset.shape)
# the loop variable is rebound on each iteration, so unpack the filtered frames back into their names
customer, campaigns_kpi, campaigns_brut, tickets, products = databases
# Identify customers who bought during the target period (y)
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
#maximum_price_paid(customer, type_of_activity)
target_proportion(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
already_bought_online(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)
target_description(targets, type_of_activity)

0_5_Machine_Learning.py (new file, 87 additions)
View File

@ -0,0 +1,87 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
exec(open('utils_ml.py').read())
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# choose the type of model
type_of_model = input('Choisissez le type de model : basique ? premium ?')
# load train and test set
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)
# processing
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
y = y_train['y_has_purchased'])
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
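# Editor's sketch (not in the original script), assuming the binary {0, 1} target: the
# 'balanced' weights satisfy weight_c = n_samples / (n_classes * n_count_c), so the rare
# positive class gets the larger weight (about {0.0: 0.55, 1.0: 5.69} in the exploration notebook).
_counts = y_train['y_has_purchased'].value_counts()
assert all(abs(weight_dict[c] - len(y_train) / (2 * _counts[c])) < 1e-6 for c in weight_dict)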
preproc = preprocess(type_of_model, type_of_activity)
# Object for storing results
model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Naive Bayes : Done")
# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic : Done")
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Logistic CV : Done")
# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest CV: Done")
# Save result
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)

0_6_Segmentation.py (new file, 40 additions)
View File

@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
exec(open('utils_segmentation.py').read())
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# load test set
dataset_test = load_test_file(type_of_activity)
# Load Model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')
# Processing
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]
y_test = dataset_test[['y_has_purchased']]
# Prediction
y_pred_prob = model.predict_proba(X_test)[:, 1]
# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
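# Editor's sketch (an assumption, not necessarily the original 0_6 logic): assign each
# customer to one of 4 propensity segments from the predicted probability.
dataset_test['segment'] = pd.cut(dataset_test['Probability_to_buy'], bins=[0, 0.25, 0.5, 0.75, 1], labels=[1, 2, 3, 4], include_lowest=True)
print(dataset_test['segment'].value_counts())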

View File

@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
cleaning_date(customerplus_copy, 'last_visiting_date')
# Variable selection
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy

View File

@ -13,14 +13,14 @@ def display_input_databases(directory_path, file_name, datetime_col = None):
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
# Number of email campaigns
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Average time to open (in hours)
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
@ -44,7 +44,6 @@ def campaigns_kpi_function(campaigns_information = None, max_date = None):
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
@ -100,6 +99,8 @@ def customerplus_kpi_function(customerplus_clean = None):
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
customerplus_clean.drop(columns = "gender", inplace = True)
# Age
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
@ -112,19 +113,53 @@ def customerplus_kpi_function(customerplus_clean = None):
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
# customerplus_clean.drop(columns = "age", inplace = True)
# Mailing consent
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
# Indicator for whether the individual lives in France
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
# customerplus_clean.drop(columns = "country", inplace = True)
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
# customerplus_clean.drop(columns = "profession", inplace = True)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
# customerplus_clean.drop(columns = "zipcode", inplace = True)
# Dummy if the customer has a structure id (tags)
# customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
return customerplus_clean
def targets_KPI(df_target = None):
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
# Target name categories for museums
df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
# Target name categories for sport companies
df_target['target_abonne'] = ((
df_target['target_name']
.str.contains('|'.join(['abo', 'adh']), case=False)
& ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
).astype(int))
df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
target_agg = df_target.groupby('customer_id').agg(
nb_targets=('target_name', 'nunique') # named aggregation: tuples specify the output column names
# all_targets=('target_name', concatenate_names),
# all_target_types=('target_type_name', concatenate_names)
).reset_index()
target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
return target_agg

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,68 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings
# Ignore warning
warnings.filterwarnings('ignore')
exec(open('../0_KPI_functions.py').read())
exec(open('plot.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5'],
'musique' : ['10', '11', '12', '13', '14']}
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]
# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)
# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customers (customers who bought tickets or received emails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list)) # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customers
    #print(f'shape of {dataset} : ', dataset.shape)
# the loop variable is rebound on each iteration, so unpack the filtered frames back into their names
customer, campaigns_kpi, campaigns_brut, tickets, products = databases
# Identify customers who bought during the target period (y)
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
#campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)

View File

@ -0,0 +1,328 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
# start of the loop generating aggregated datasets for the selected entertainment companies
for directory_path in nb_compagnie:
df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_databases(directory_path, file_name = "target_information")
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
# create the number_company column, used to aggregate the results
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
# Index processing: prefix customer ids with the company number
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
# Concatenation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
return customer, campaigns_kpi, campaigns_brut, tickets, products
def save_file_s3(File_name, type_of_activity):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets, company_list, show_diagram=False):
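# editor's note: for each company this flags the customer concentrating the largest share
# of total revenue, in practice the anonymous aggregate account that is removed later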
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
plt.title(f'Répartition des montants totaux pour la compagnie {company}')
plt.show()
return outlier_list
def valid_customer_detection(products, campaigns_brut):
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products):
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
    # return an empty id for detected outliers so the corresponding rows can be filtered
    # out downstream (the original character-by-character join never matched the full ids)
    return '' if lst in elements_to_remove else lst
def compute_nb_clients(customer, type_of_activity):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company')
plt.ylabel("Number of clients (thousands)")
plt.title(f"Number of clients for {type_of_activity}")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer, type_of_activity):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
plt.xlabel('Company')
plt.ylabel("Maximal price of a ticket Prix")
plt.title(f"Maximal price of a ticket for {type_of_activity}")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
def mailing_consent(customer, type_of_activity):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company')
plt.ylabel('Consent')
plt.title(f'Consent of mailing for {type_of_activity}')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Create the grouped barplot
fig, ax = plt.subplots(figsize=(10, 6))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and create the grouped bars
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "purchased" if label else "no purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Update the bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add the labels, the legend, etc.
ax.set_xlabel('Company')
ax.set_ylabel('Consent')
ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Display the plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer, type_of_activity):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
# Create the barplot
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Femme")
plt.bar(company_genders["number_company"], company_genders["gender_other"],
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")
plt.xlabel('Company')
plt.ylabel("Gender")
plt.title(f"Gender of Customer for {type_of_activity}")
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer, type_of_activity):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
plt.xlabel('Company')
plt.ylabel("Share of French Customer")
plt.title(f"Share of French Customer for {type_of_activity}")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi, type_of_activity):
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
plt.xlabel('Company')
plt.ylabel("Share of Customers who did not open mail")
plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer, type_of_activity):
campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()
plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])
plt.xlabel('Company')
plt.ylabel("Number of Customers (thousands)")
plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
plt.legend()
plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products, campaigns_brut, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Month of the first email
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Merge
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Month from which the customer is considered known
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Number of orders per month
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
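# editor's note: a purchase counts as coming from a "known" customer when it occurs more
# than one month after the first recorded contact (first email or first purchase)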
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Number of orders per month by customer type
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Plot of the number of orders
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")
# format the x-axis to display only abbreviated month/year labels
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
plt.title(f"Number of Sales for {type_of_activity}")
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets, type_of_activity):
nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])
plt.xlabel('Company')
plt.ylabel("Share of Tickets Bought Online")
plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
def box_plot_price_tickets(tickets, type_of_activity):
price_tickets = tickets[(tickets['total_amount'] > 0)]
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
plt.title(f"Box plot of price tickets for {type_of_activity}")
plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

README.md (new file, 33 additions)
View File

@ -0,0 +1,33 @@
# Business data challenge 2023-2024 | ENSAE Paris
# Arenametrix: customer segmentation
## Team 1
* Antoine JOUBREL
* Alexis REVELLE
* Fanta RODRIGUE
* Thomas PIQUÉ
## Coaches
* Elia LAPENTA
* Michael VISSER
### Problem description
The goal of this project is to create segments of customers for 15 companies belonging to 3 different types of activity (sport, museums, and music).
### Our approach
We opted for a sector-based approach, which means that 3 segmentations were performed (one for each type of activity).
As the segments have to be linked to a probability of future purchase, we segment directly on the predicted probability of a purchase during the coming year. The first modeling step is a pipeline that fits 3 ML models (naive Bayes, random forest, and logistic regression) to predict whether a customer will purchase during the year. We then use the estimated purchase probability to split the customers into 4 segments; for each segment, we can estimate the potential number of tickets and revenue for the coming year, as in the sketch below.
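A minimal illustration of this segment-then-estimate step, assuming a scored table with columns `score`, `nb_tickets`, and `total_amount` (illustrative names, not necessarily the exact ones used in the repo):

```python
import pandas as pd

def summarize_segments(scored: pd.DataFrame, proba_col: str = "score") -> pd.DataFrame:
    scored = scored.copy()
    # 4 segments of increasing purchase propensity, one per probability quartile
    scored["segment"] = pd.qcut(scored[proba_col], q=4, labels=[1, 2, 3, 4])
    # probability-weighted expectations of tickets and revenue for the coming year
    scored["expected_tickets"] = scored[proba_col] * scored["nb_tickets"]
    scored["expected_revenue"] = scored[proba_col] * scored["total_amount"]
    return scored.groupby("segment")[["expected_tickets", "expected_revenue"]].sum()
```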
### How to run the code
- run 0_1_Input_cleaning.py to clean the raw data and generate dataframes that will be used to build datasets with insightful variables.
- run 0_2_Dataset_construction.py.
- run 0_3_General_modelization_dataset.py to generate test and train sets for the 3 types of activity.
- run 0_4_Generate_stat_desc.py to generate graphics describing the data.
- run 0_5_Machine_Learning.py: 3 ML models will be fitted on the data, and results will be exported for all 3 types of activity.
- run 0_6_Segmentation.py: the test set will be scored with the optimal parameters computed previously, which yields a propensity score (the probability of a future purchase). Segmentation is performed according to these scores. This script exports graphics describing the marketing personas associated with the segments as well as their business value.
- run 0_7_CA_segment.py: the scores will be adjusted to better fit the overall probability of a purchase. The adjusted score is used to estimate the number of tickets sold and the revenue generated during the coming year.

View File

@ -65,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
"metadata": {},
"outputs": [],
@ -115,9 +115,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
@ -228,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
"metadata": {},
"outputs": [
@ -238,7 +238,7 @@
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -254,7 +254,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 10,
"id": "4680f202-979e-483f-89b8-9df877203bcf",
"metadata": {},
"outputs": [
@ -265,7 +265,7 @@
" 0.54812838])"
]
},
"execution_count": 58,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -282,7 +282,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 11,
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
"metadata": {},
"outputs": [
@ -311,7 +311,7 @@
},
{
"cell_type": "code",
"execution_count": 258,
"execution_count": 12,
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
"metadata": {},
"outputs": [
@ -650,7 +650,7 @@
"[354365 rows x 17 columns]"
]
},
"execution_count": 258,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -662,7 +662,7 @@
},
{
"cell_type": "code",
"execution_count": 259,
"execution_count": 13,
"id": "648fb542-0186-493d-b274-be2c26a11967",
"metadata": {},
"outputs": [],
@ -677,7 +677,7 @@
},
{
"cell_type": "code",
"execution_count": 260,
"execution_count": 14,
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
"metadata": {},
"outputs": [
@ -1016,7 +1016,7 @@
"[354365 rows x 17 columns]"
]
},
"execution_count": 260,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -1510,12 +1510,14 @@
"\n",
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n",
"- ajouter un intercept\n",
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO "
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
"\n",
"#### A recopier dans la pipeline -> section 2 bis"
]
},
{
"cell_type": "code",
"execution_count": 266,
"execution_count": 15,
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
"metadata": {},
"outputs": [
@ -1817,7 +1819,7 @@
"[354365 rows x 15 columns]"
]
},
"execution_count": 266,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -1831,7 +1833,7 @@
},
{
"cell_type": "code",
"execution_count": 267,
"execution_count": 16,
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
"metadata": {},
"outputs": [
@ -1847,8 +1849,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 10:07:29 Log-Likelihood: -83135.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 07:57:46 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@ -1887,7 +1889,7 @@
},
{
"cell_type": "code",
"execution_count": 268,
"execution_count": 17,
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
"metadata": {},
"outputs": [],
@ -1908,7 +1910,7 @@
},
{
"cell_type": "code",
"execution_count": 269,
"execution_count": 18,
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
"metadata": {},
"outputs": [
@ -2210,7 +2212,7 @@
"[354365 rows x 15 columns]"
]
},
"execution_count": 269,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -2221,7 +2223,7 @@
},
{
"cell_type": "code",
"execution_count": 289,
"execution_count": 19,
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
"metadata": {},
"outputs": [
@ -2237,8 +2239,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 10:26:14 Log-Likelihood: -83135.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 07:58:13 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@ -2276,12 +2278,226 @@
"print(result.summary())"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully (Exit mode 0)\n",
" Current function value: 0.23562928627877766\n",
" Iterations: 240\n",
" Function evaluations: 243\n",
" Gradient evaluations: 240\n",
"const 0.000000e+00\n",
"nb_tickets 2.477006e-01\n",
"nb_purchases 1.636902e-03\n",
"total_amount 8.839088e-04\n",
"nb_suppliers 1.906550e-65\n",
"vente_internet_max 0.000000e+00\n",
"purchase_date_min 0.000000e+00\n",
"purchase_date_max 0.000000e+00\n",
"nb_tickets_internet 7.232680e-112\n",
"is_email_true 8.202187e-08\n",
"opt_in 0.000000e+00\n",
"gender_female 1.624424e-170\n",
"gender_male 4.961315e-220\n",
"nb_campaigns 6.276733e-205\n",
"nb_campaigns_opened 2.228531e-176\n",
"dtype: float64\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
"Time: 10:45:37 Log-Likelihood: -83152.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
"\n",
"# sans spécification, le alpha optimal est déterminé par cross validation\n",
"# remplacer alpha=32 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n",
"\n",
"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
"\n",
"print(result.pvalues)\n",
"print(result.summary())"
]
},
{
"cell_type": "markdown",
"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
"metadata": {},
"source": [
"### Other method : take into account the weigths ! Pb : with this method, no penalty allowed"
]
},
{
"cell_type": "code",
"execution_count": 247,
"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
"metadata": {},
"outputs": [],
"source": [
"# define a function to generate summaries of logit model\n",
"\n",
"def model_logit(X, y, weight_dict, add_constant=False) :\n",
" # Generate sample weights based on class weights computed earlier\n",
" sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
"\n",
" if add_constant :\n",
" X_const = sm.add_constant(X)\n",
" else :\n",
" X_const = X\n",
" \n",
" # Use GLM from statsmodels with Binomial family for logistic regression\n",
" model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
" \n",
" # fit without penalty\n",
" result = model.fit()\n",
"\n",
" result_summary = result.summary()\n",
" \n",
" return result_summary"
]
},
{
"cell_type": "code",
"execution_count": 248,
"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -1.8693e+05\n",
"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
"=======================================================================================\n"
]
}
],
"source": [
"# with the function\n",
"\n",
"# 1. logit with weights\n",
"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
"print(results_logit_weight)"
]
},
{
"cell_type": "code",
"execution_count": 252,
"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -83141.\n",
"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2. logit without weights\n",
"\n",
"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
"print(results_logit)"
]
},
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
"metadata": {},
"source": [
"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
]
},
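{
"cell_type": "markdown",
"id": "lasso-sketch-md",
"metadata": {},
"source": [
"A minimal sketch of the idea, assuming `X` and `y` are the design matrix and target built above: an L1-penalised logistic regression is refit over a grid of penalty strengths, and the variables whose coefficients are driven to zero first are the least important. The penalty grid is illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "lasso-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: LASSO (L1) coefficient paths for logistic regression. Assumes X and y exist.\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"X_scaled = StandardScaler().fit_transform(X)\n",
"Cs = np.logspace(-3, 1, 10)  # illustrative grid; smaller C = stronger penalty\n",
"coefs = []\n",
"for C in Cs:\n",
"    lasso_logit = LogisticRegression(penalty='l1', solver='saga', C=C, max_iter=5000)\n",
"    lasso_logit.fit(X_scaled, y)\n",
"    coefs.append(lasso_logit.coef_[0])\n",
"\n",
"plt.plot(np.log10(Cs), np.array(coefs))\n",
"plt.xlabel('log10(C)')\n",
"plt.ylabel('coefficient')\n",
"plt.title('LASSO coefficient paths')\n",
"plt.show()"
]
},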
{

410
utils_ml.py Normal file
View File

@ -0,0 +1,410 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
def load_train_test(type_of_activity, type_of_model):
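# Read the train/test sets for the given activity from S3 (assumes a shared fs
# filesystem object, as in the other scripts); for the 'premium' model, keep
# only a fixed subset of companies.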
BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
with fs.open(File_path_train, mode="rb") as file_in:
dataset_train = pd.read_csv(file_in, sep=",")
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
with fs.open(File_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
if type_of_model=='premium':
dataset_train['company'] = dataset_train['customer_id'].apply(lambda x: x.split('_')[0])
dataset_test['company'] = dataset_test['customer_id'].apply(lambda x: x.split('_')[0])
dataset_train = dataset_train[dataset_train['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
dataset_test = dataset_test[dataset_test['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
return dataset_train, dataset_test
def save_file_s3(File_name, type_of_activity, type_of_model, model):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
if model_path:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
else:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
result_set.to_csv(file_out, index = False)
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
model_bytes = pickle.dumps(classifier)
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
f.write(model_bytes)
def compute_recall(group):
return recall_score(group['y_has_purchased'], group['prediction'])
def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
test = dataset_test.copy()
test['prediction'] = y_pred
test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
def features_target_split(dataset_train, dataset_test):
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
'target_jeune', 'target_abonne']
X_train = dataset_train[features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test[features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
def preprocess(type_of_model, type_of_activity):
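# Build the ColumnTransformer: numeric features are zero-imputed and standardised,
# binary features are imputed with the most frequent value; the premium model adds
# activity-specific target columns.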
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']
binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']
if type_of_activity=='musee':
numeric_features.remove('time_to_open')
if type_of_model=='premium':
if type_of_activity=='musique':
binary_features.extend(['target_optin', 'target_newsletter'])
elif type_of_activity=='sport':
binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
else:
binary_features.extend([ 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])
numeric_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value=0)),
("scaler", StandardScaler())
])
binary_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
])
preproc = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("bin", binary_transformer, binary_features)
]
)
return preproc
def draw_confusion_matrix(y_test, y_pred, model):
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
def draw_roc_curve(y_test, y_pred_prob, model):
# Compute the false positive rate (FPR) and true positive rate (TPR)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
# Compute the area under the ROC curve (AUC)
roc_auc = auc(fpr, tpr)
plt.figure(figsize = (14, 8))
plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="red",label="Random Baseline", linestyle="--")
plt.grid(color='gray', linestyle='--', linewidth=0.5)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve", size=18)
plt.legend(loc="lower right")
plt.show()
save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
def draw_calibration_curve(y_test, y_pred_prob, model):
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
# Plot the calibration curve
plt.plot(mean_pred, frac_pos, 's-', label=model)
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positive predictions')
plt.title("Calibration Curve")
plt.legend()
plt.show()
save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
def draw_features_importance(pipeline, model, randomF = False):
if randomF:
coefficients = pipeline.named_steps[model].feature_importances_
else:
coefficients = pipeline.named_steps[model].coef_[0]
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
# Plot feature importances
plt.figure(figsize=(12, 8))
plt.barh(feature_names, coefficients, color='skyblue')
plt.xlabel('Coefficient / importance')
plt.ylabel('Features')
plt.title("Features' Importance")
plt.grid(True)
plt.show()
save_file_s3("Features_", type_of_activity, type_of_model, model)
def draw_prob_distribution(y_pred_prob, model):
plt.figure(figsize=(10, 8))
plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogram of predicted probabilities for class 1')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
def draw_prob_distribution_companies(y_pred_prob, model):
test = dataset_test.copy()
test['probability to buy'] = y_pred_prob
test['company'] = test['customer_id'].str.split('_', expand=True)[0]
sns.histplot(data=test, x='probability to buy', hue='company', element='step',
stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogram of probabilities for class 1 by company')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
max_iter=5000, n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_Benchmark"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
'LogisticRegression_cv__penalty': ['l1', 'l2'],
'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'LogisticRegression_cv')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF', RandomForestClassifier(class_weight = weight_dict,
n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'randomF', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {
'randomF_cv__n_estimators': [100, 300],
'randomF_cv__max_features': ['sqrt', 'log2'],
'randomF_cv__min_samples_split': [2, 10],
'randomF_cv__min_samples_leaf': [1, 4],
'randomF_cv__class_weight': [weight_dict]
}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF_cv', RandomForestClassifier(n_jobs=-1))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
unique_classes, counts = np.unique(y_train, return_counts=True)
class_priors = counts / counts.sum()
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('Naive_Bayes', GaussianNB(priors=class_priors))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "Naive_Bayes"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_prob_distribution(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
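# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module). The functions above
# rely on shared globals (fs, preproc, weight_dict, dataset_test, type_of_activity,
# type_of_model) that the calling script is expected to define; the values below
# are illustrative assumptions, not the project's actual configuration.
if __name__ == '__main__':
    S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
    fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
    type_of_activity = 'sport'        # illustrative
    type_of_model = 'standard'        # illustrative
    dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
    X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
    preproc = preprocess(type_of_model, type_of_activity)
    weight_dict = {0: 1.0, 1: 10.0}   # illustrative class weights
    model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
    model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)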

27
utils_segmentation.py Normal file
View File

@ -0,0 +1,27 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
def load_model(type_of_activity, model):
BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f:
model_bytes = f.read()
model = pickle.loads(model_bytes)
return model
def load_test_file(type_of_activity):
file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
with fs.open(file_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
return dataset_test
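# Hedged usage sketch (not part of the original module): load a saved classifier
# and score the matching test set. Assumes the fs S3 filesystem object is created
# as in the other scripts; 'sport' and 'randomF_cv' are illustrative arguments.
if __name__ == '__main__':
    fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': "https://" + os.environ["AWS_S3_ENDPOINT"]})
    classifier = load_model('sport', 'randomF_cv')
    dataset_test = load_test_file('sport')
    dataset_test['score'] = classifier.predict_proba(dataset_test)[:, 1]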

438
utils_stat_desc.py Normal file
View File

@ -0,0 +1,438 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
targets = pd.DataFrame()
# Loop over the companies to build aggregated datasets
for directory_path in nb_compagnie:
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
df_target_KPI = targets_KPI(df_target = df_target_information)
# Merge target KPIs onto the customer base and fill missing values with 0
df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
targets_columns = list(df_target_KPI.columns)
targets_columns.remove('customer_id')
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
# Create the number_company column, used to aggregate results
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
df_target_KPI["number_company"]=int(directory_path)
# Prefix customer_id with the company number
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
# Remove companies' outliers
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
# Harmonize the set of customers across databases
customer_id = df_tickets_kpi['customer_id'].to_list()
df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
# Concatenation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
targets = pd.concat([targets, df_target_KPI], ignore_index=True)
return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
def remove_outlier_total_amount(tickets):
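# Drop customers whose total_amount lies above the upper IQR fence (Q3 + 1.5*IQR).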
Q1 = tickets['total_amount'].quantile(0.25)
Q3 = tickets['total_amount'].quantile(0.75)
IQR = Q3 - Q1
upper = Q3 + 1.5*IQR
outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
tickets = tickets[~tickets['customer_id'].isin(outliers)]
return tickets
def save_file_s3(File_name, type_of_activity):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets, company_list, show_diagram=False):
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
plt.title(f'Breakdown of total amounts for company {company}')
plt.show()
return outlier_list
def valid_customer_detection(products, campaigns_brut):
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products):
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
return ''.join([x for x in lst if x not in elements_to_remove])
def compute_nb_clients(customer, type_of_activity):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company')
plt.ylabel("Number of clients (thousands)")
plt.title(f"Number of clients Across {type_of_activity} Companies")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer, type_of_activity):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
plt.xlabel('Company Number')
plt.ylabel("Maximal price of a ticket Prix")
plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
def target_proportion(customer, type_of_activity):
df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
'customer_id' : 'nunique'}).reset_index()
df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
plt.xlabel('Company Number')
plt.ylabel('Share (%)')
plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
plt.show()
save_file_s3("share_target_", type_of_activity)
def mailing_consent(customer, type_of_activity):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
mailing_consent["opt_in"] *= 100
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company Number')
plt.ylabel('Mailing Consent (%)')
plt.title(f'Mailing Consent Across {type_of_activity} Companies')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer, type_of_activity):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Build the grouped barplot
fig, ax = plt.subplots(figsize=(10, 6))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and draw grouped bars
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Shift bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add axis labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Mailing Consent (%)')
ax.set_title(f'Mailing Consent by Target Across {type_of_activity} Companies')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Display the plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer, type_of_activity):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
company_genders["gender_male"] *= 100
company_genders["gender_female"] *= 100
company_genders["gender_other"] *= 100
# Build the barplot
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Female")
plt.bar(company_genders["number_company"], company_genders["gender_other"],
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
plt.xlabel('Company Number')
plt.ylabel("Frequency (%)")
plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer, type_of_activity):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
company_country_fr["country_fr"] *= 100
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
plt.xlabel('Company Number')
plt.ylabel("Share of French Customer (%)")
plt.title(f"Share of French Customer Across {type_of_activity} Companies")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi, type_of_activity):
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
plt.xlabel('Company Number')
plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies")
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer, type_of_activity):
campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
fig, ax = plt.subplots(figsize=(10, 6))
categories = campaigns_effectiveness["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and draw grouped bars
for label in campaigns_effectiveness["has_purchased_target_period"].unique():
label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Shift bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add axis labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Share of Consent (%)')
ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products, campaigns_brut, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Month of the first email received
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Merge
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Month from which the customer is considered known
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Number of purchases per month
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Number of purchases per month by customer type
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Plot the number of purchases
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
# Format the x-axis to show abbreviated month-year labels only
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
plt.title(f"Number of Sales Across {type_of_activity} Companies")
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets, type_of_activity):
nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
nb_tickets_internet['prop_purchases_internet'] *= 100
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Purchases Bought Online (%)")
plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
def already_bought_online(tickets, type_of_activity):
nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
'customer_id' : 'nunique'}
).reset_index())
nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Customer who Bought Online at least once (%)")
plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
plt.show()
save_file_s3("First_buy_internet_", type_of_activity)
def box_plot_price_tickets(tickets, type_of_activity):
price_tickets = tickets[(tickets['total_amount'] > 0)]
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)
def target_description(targets, type_of_activity):
describe_target = targets.groupby('number_company').agg(
prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))
plot = describe_target.plot.bar()
# Adding a title
plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")
# Adding labels for x and y axes
plot.set_xlabel("Company Number")
plot.set_ylabel("Target Proportion")
plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')
# Adding a legend
plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")
save_file_s3("target_category_proportion_", type_of_activity)