Merge pull request 'generalization' (#9) from generalization into main
Reviewed-on: #9
This commit is contained in: commit a0256c551b

70	0_4_Generate_stat_desc.py	Normal file
@@ -0,0 +1,70 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

exec(open('0_KPI_functions.py').read())
exec(open('utils_stat_desc.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5', '6', '7', '8', '9'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
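# NB: outlier_detection (defined in the exec'd helpers) flags, for each company, the
# single account concentrating the largest share of total_amount — assumed to be the
# anonymous "guest" aggregate rather than a real customer.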

# Identify valid customers (customers who bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]

for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))  # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)]  # keep only valid customers
    # print(f'shape of {dataset} : ', dataset.shape)

# Re-bind the filtered frames: plain reassignment of `dataset` inside the loop
# would not propagate back to the original variables
customer, campaigns_kpi, campaigns_brut, tickets, products = databases

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)

# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
already_bought_online(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)
103	0_5_Machine_Learning.py	Normal file
@@ -0,0 +1,103 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings


exec(open('utils_ml.py').read())

warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# Load train and test set
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

dataset_train, dataset_test = load_train_test(type_of_activity)

X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)

# Processing: compute balanced class weights to offset the imbalance of the target
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y_train['y_has_purchased']),
                                            y=y_train['y_has_purchased'])

weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
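# Worked example of sklearn's 'balanced' weighting (illustrative numbers, assuming a
# 90/10 split): weight_c = n_samples / (n_classes * n_c), so with 900 negatives and
# 100 positives weight_dict is roughly {0.0: 0.56, 1.0: 5.0} — minority errors cost ~9x more.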

numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
            'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']

numeric_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_features = ['opt_in']

# Transformer for the categorical features
categorical_transformer = Pipeline(steps=[
    # ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent
    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preproc = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)
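# NB: `preproc` and `weight_dict` are read as globals by the pipeline_* helpers pulled in
# via exec(open('utils_ml.py').read()) above. Note also that 'opt_in' appears in both
# numeric_features and categorical_features, so it enters the design matrix twice
# (once scaled, once one-hot encoded).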

# Object for storing results
model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])

# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Naive Bayes : Done")

# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic : Done")
"""
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
print("Logistic CV : Done")

# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
print("Random Forest CV : Done")
"""
# Save result
save_result_set_s3(model_result, "resultat", type_of_activity)
40	0_6_Segmentation.py	Normal file
@@ -0,0 +1,40 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings


exec(open('utils_segmentation.py').read())
warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')

# Load test set
dataset_test = load_test_file(type_of_activity)

# Load model
model = load_model(type_of_activity, 'LogisticRegression_Benchmark')
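# NB: the pickle written by utils_ml stores the whole fitted sklearn Pipeline
# (preprocessing + classifier), so the raw feature columns below can be passed to
# predict_proba without re-applying the scaler or encoder by hand.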

# Processing
X_test = dataset_test[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
            'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']]

y_test = dataset_test[['y_has_purchased']]

# Prediction
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Add probability to dataset_test
dataset_test['Probability_to_buy'] = y_pred_prob
print('probability added to dataset_test')
print(dataset_test.head())
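# A possible next step (hypothetical, not part of this commit): derive segments by
# bucketing the predicted probability, e.g.
#   dataset_test['segment'] = pd.cut(dataset_test['Probability_to_buy'],
#                                    bins=[0, 0.25, 0.5, 0.75, 1], labels=['1', '2', '3', '4'])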

148	Descriptive_statistics/debug.ipynb	Normal file
File diff suppressed because one or more lines are too long

68	Descriptive_statistics/generate_stat_desc.py	Normal file
@@ -0,0 +1,68 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

exec(open('../0_KPI_functions.py').read())
exec(open('plot.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (customers who bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]

for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))  # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)]  # keep only valid customers
    # print(f'shape of {dataset} : ', dataset.shape)

# Re-bind the filtered frames (reassigning `dataset` inside the loop does not propagate)
customer, campaigns_kpi, campaigns_brut, tickets, products = databases

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)

# Generate graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
#campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)
328	Descriptive_statistics/plot.py	Normal file
@@ -0,0 +1,328 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns


def load_files(nb_compagnie):
    customer = pd.DataFrame()
    campaigns_brut = pd.DataFrame()
    campaigns_kpi = pd.DataFrame()
    products = pd.DataFrame()
    tickets = pd.DataFrame()

    # Loop generating aggregated datasets for the selected companies
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name="customerplus_cleaned")
        df_campaigns_brut = display_databases(directory_path, file_name="campaigns_information", datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_databases(directory_path, file_name="products_purchased_reduced", datetime_col=['purchase_date'])
        df_target_information = display_databases(directory_path, file_name="target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)

        # Create the number_company column, used to aggregate the results
        df_tickets_kpi["number_company"] = int(directory_path)
        df_campaigns_brut["number_company"] = int(directory_path)
        df_campaigns_kpi["number_company"] = int(directory_path)
        df_customerplus_clean["number_company"] = int(directory_path)
        df_target_information["number_company"] = int(directory_path)

        # Prefix customer ids with the company number
        df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
        df_campaigns_brut["customer_id"] = directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
        df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
        df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
        df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

        # Concatenation
        customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
        campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
        campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
        tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
        products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)

    return customer, campaigns_kpi, campaigns_brut, tickets, products
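# The '<company>_<id>' prefix added above is what later lets utils_ml recover the company
# for per-company recall and plots, via customer_id.str.split('_', expand=True)[0].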

def save_file_s3(File_name, type_of_activity):
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()
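# NB: the plot helpers below call plt.show() before save_file_s3(); with a non-interactive
# matplotlib backend the current figure survives show(), so the later savefig() still
# captures it, but in an interactive session the saved PNG may come out blank.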

def outlier_detection(tickets, company_list, show_diagram=False):

    outlier_list = list()

    for company in company_list:
        total_amount_share = tickets[tickets['number_company'] == int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
        total_amount_share['CA'] = total_amount_share['total_amount'].sum()
        total_amount_share['share_total_amount'] = total_amount_share['total_amount'] / total_amount_share['CA']

        total_amount_share_index = total_amount_share.set_index('customer_id')
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis=0, ascending=False)
        # print('df circulaire : ', df_circulaire.head())
        top = df_circulaire[:1]
        # print('top : ', top)
        outlier_list.append(top.index[0])
        rest = df_circulaire[1:]

        rest_sum = rest.sum()

        new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

        if show_diagram:
            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
            plt.show()
    return outlier_list


def valid_customer_detection(products, campaigns_brut):
    products_valid = products[products['purchase_date'] >= "2021-05-01"]
    consumer_valid_product = products_valid['customer_id'].to_list()

    campaigns_valid = campaigns_brut[campaigns_brut["sent_at"] >= "2021-05-01"]
    consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()

    consumer_valid = consumer_valid_product + consumer_valid_campaigns
    return consumer_valid


def identify_purchase_during_target_periode(products):
    products_target_period = products[(products['purchase_date'] >= "2022-11-01")
                                      & (products['purchase_date'] <= "2023-11-01")]
    customer_target_period = products_target_period['customer_id'].to_list()
    return customer_target_period


def remove_elements(lst, elements_to_remove):
    # Blank out ids flagged as outliers (applied element-wise to the customer_id column)
    return '' if lst in elements_to_remove else lst
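# Example of the intended behaviour (ids are illustrative):
#   remove_elements('1_42', ['1_42']) -> ''        # outlier id is blanked out
#   remove_elements('1_43', ['1_42']) -> '1_43'    # other ids pass through unchanged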

def compute_nb_clients(customer, type_of_activity):
    company_nb_clients = customer[customer["purchase_count"] > 0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()

    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Consent of mailing for {type_of_activity}')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
    # Build the grouped barplot
    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw the grouped bars
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Labels, legend, etc. (type_of_activity is read from the caller's global scope)
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    # Display the plot
    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)

def gender_bar(customer, type_of_activity):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Build the stacked barplot
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label="Homme")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom=company_genders["gender_male"], label="Femme")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom=company_genders["gender_male"] + company_genders["gender_female"], label="Inconnu")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of Customer for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French Customer")
    plt.title(f"Share of French Customer for {type_of_activity}")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])

    plt.xlabel('Company')
    plt.ylabel("Share of Customers who did not open mail")
    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):

    campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()

    plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])

    plt.xlabel('Company')
    plt.ylabel("Number of Customers (thousands)")
    plt.title(f"Number of Customers who have bought or received mails for {type_of_activity}")
    plt.legend()
    plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)

def sale_dynamics(products, campaigns_brut, type_of_activity):
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns={'purchase_date': 'first_purchase_event'}, inplace=True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first email received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns={'sent_at': 'first_email_reception'}, inplace=True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on='customer_id', how='outer')

    # Month from which the customer is considered known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis=1), utc=True, format='ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on=['customer_id'], how='inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month and per type of customer
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns={'purchase_id': 'nb_purchases'}, inplace=True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns={'customer_id': 'nb_new_customer'}, inplace=True)

    # Chart of the number of orders
    purchases_graph = nb_purchases_graph

    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021, 3, 1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"] == False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"] == True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")

    # Show only month-year labels on the x axis
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)

def tickets_internet(tickets, type_of_activity):
    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
    nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Tickets Bought Online")
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    price_tickets = tickets[(tickets['total_amount'] > 0)]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of price tickets for {type_of_activity}")
    plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)
8499	Notebook_AR.ipynb
File diff suppressed because one or more lines are too long

358	utils_ml.py	Normal file
@@ -0,0 +1,358 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import pickle
import warnings


def load_train_test(type_of_activity):
    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
    File_path_train = BUCKET + "/Train_set.csv"
    File_path_test = BUCKET + "/Test_set.csv"

    with fs.open(File_path_train, mode="rb") as file_in:
        dataset_train = pd.read_csv(file_in, sep=",")
        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)

    with fs.open(File_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)

    return dataset_train, dataset_test
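# NB: this module is pulled in via exec(open('utils_ml.py').read()), so `fs` (the s3fs
# filesystem), `type_of_activity`, `preproc`, `weight_dict` and `dataset_test` are
# resolved from the calling script's global namespace rather than passed as arguments.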

def save_file_s3(File_name, type_of_activity, model):
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
    if model_path:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
    else:
        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index=False)


def save_model_s3(File_name, type_of_activity, model, classifier):
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)


def compute_recall(group):
    return recall_score(group['y_has_purchased'], group['prediction'])


def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
    test = dataset_test.copy()
    test['prediction'] = y_pred
    test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
    # group on the copy that actually carries the 'company' and 'prediction' columns
    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)

def features_target_split(dataset_train, dataset_test):
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
            'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
            'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
    X_train = dataset_train[features_l]
    y_train = dataset_train[['y_has_purchased']]

    X_test = dataset_test[features_l]
    y_test = dataset_test[['y_has_purchased']]
    return X_train, X_test, y_train, y_test

def draw_confusion_matrix(y_test, y_pred, model):
    conf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()
    save_file_s3("Confusion_matrix_", type_of_activity, model)

def draw_roc_curve(y_test, y_pred_prob, model):
    # Compute the false positive rate (FPR) and true positive rate (TPR)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)

    # Compute the area under the ROC curve (AUC)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(14, 8))
    plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="red", label="Random Baseline", linestyle="--")
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve", size=18)
    plt.legend(loc="lower right")
    plt.show()
    save_file_s3("Roc_curve_", type_of_activity, model)


def draw_calibration_curve(y_test, y_pred_prob, model):
    frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)

    # Plot the calibration curve
    plt.plot(mean_pred, frac_pos, 's-', label=model)
    plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positive predictions')
    plt.title("Calibration Curve")
    plt.legend()
    plt.show()
    save_file_s3("Calib_curve_", type_of_activity, model)

def draw_features_importance(pipeline, model, randomF=False):
    if randomF:
        coefficients = pipeline.named_steps[model].feature_importances_
    else:
        coefficients = pipeline.named_steps[model].coef_[0]

    feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, coefficients, color='skyblue')
    plt.xlabel("Features' Importance")
    plt.ylabel('Caractéristiques')
    plt.title("Features' Importance")
    plt.grid(True)
    plt.show()
    save_file_s3("Features_", type_of_activity, model)

def draw_prob_distribution(y_pred_prob, model):
    plt.figure(figsize=(8, 6))
    plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)

    plt.xlim(0, 1)
    plt.ylim(0, None)

    plt.title('Histogramme des probabilités pour la classe 1')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_", type_of_activity, model)

def draw_prob_distribution_companies(y_pred_prob, model):
    test = dataset_test.copy()
    test['probability to buy'] = y_pred_prob
    test['company'] = test['customer_id'].str.split('_', expand=True)[0]
    sns.histplot(data=test, x='probability to buy', hue='company', element='step',
                 stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
    plt.xlim(0, 1)
    plt.ylim(0, None)
    plt.title('Histogram of probabilities for class 1 by company')
    plt.xlabel('Probability')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
    save_file_s3("prob_dist_companies_", type_of_activity, model)

def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight=weight_dict,
                                                            max_iter=5000, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_Benchmark"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline)
    return model_result

def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
    y_train = y_train['y_has_purchased']
    param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
                  'LogisticRegression_cv__penalty': ['l1', 'l2'],
                  'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
                               n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "LogisticRegression_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'LogisticRegression_cv')
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search)
    return model_result

def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF', RandomForestClassifier(class_weight=weight_dict,
                                           n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    # compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(y_test, y_pred_prob, model)
    draw_features_importance(pipeline, 'randomF', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(y_test, y_pred_prob, model)
    save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline)
    return model_result


def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
    y_train = y_train['y_has_purchased']
    param_grid = {
        'randomF_cv__n_estimators': [100, 300],
        'randomF_cv__max_features': ['sqrt', 'log2'],
        'randomF_cv__min_samples_split': [2, 10],
        'randomF_cv__min_samples_leaf': [1, 4],
        'randomF_cv__class_weight': [weight_dict]
    }
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('randomF_cv', RandomForestClassifier(n_jobs=-1))
    ])
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score),
                               error_score='raise', n_jobs=-1)

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    best_pipeline = grid_search.best_estimator_
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "randomF_cv"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)

    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
    draw_prob_distribution(y_pred_prob, model)
    draw_prob_distribution_companies(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('randomF_cv', type_of_activity, model, grid_search)
    return model_result
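
The `randomF_cv__` prefixes in `param_grid` follow scikit-learn's `<step_name>__<parameter>` convention, which routes each grid entry to the pipeline step of that name. A self-contained toy example of the same mechanism (all names below are illustrative):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

toy_pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                               ('clf', LogisticRegression())])
# 'clf__C' reaches LogisticRegression(C=...) because the step is named 'clf'
toy_search = GridSearchCV(toy_pipeline, {'clf__C': [0.1, 1.0, 10.0]}, cv=3)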


def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
    # Use the empirical class frequencies as priors
    unique_classes, counts = np.unique(y_train, return_counts=True)
    class_priors = counts / counts.sum()
    pipeline = Pipeline(steps=[
        ('preprocessor', preproc),
        ('Naive_Bayes', GaussianNB(priors=class_priors))
    ])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
    model = "Naive_Bayes"
    result = pd.DataFrame({"Model": [model],
                           "Accuracy": [accuracy_score(y_test, y_pred)],
                           "Recall": [recall_score(y_test, y_pred)],
                           "F1_score": [f1_score(y_test, y_pred, average="macro")],
                           "AUC": [auc(fpr, tpr)]})
    model_result = pd.concat([model_result, result])
    draw_confusion_matrix(y_test, y_pred, model)
    draw_roc_curve(X_test, y_pred_prob, model)
    draw_prob_distribution(y_pred_prob, model)
    draw_calibration_curve(X_test, y_pred_prob, model)
    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline)
    return model_result
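
Each benchmark function appends one row of metrics to the shared `model_result` frame and returns it, so the functions chain naturally. A hedged sketch of a driver loop, assuming the train/test splits are already prepared (the empty starting frame and the loop itself are illustrative):

# Illustrative driver: run the benchmarks and collect one metrics row per model.
model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
for benchmark in (pipeline_randomF_benchmark, pipeline_naiveBayes_benchmark):
    model_result = benchmark(X_train, y_train, X_test, y_test, model_result)
print(model_result.sort_values("AUC", ascending=False))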

27 utils_segmentation.py Normal file
@ -0,0 +1,27 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings


def load_model(type_of_activity, model):
    # Relies on the s3fs filesystem object `fs` created by the calling script
    BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    filename = model + '.pkl'
    file_path = BUCKET + filename
    with fs.open(file_path, mode="rb") as f:
        model_bytes = f.read()

    model = pickle.loads(model_bytes)
    return model


def load_test_file(type_of_activity):
    file_path_test = f"projet-bdc2324-team1/Generalization/{type_of_activity}/Test_set.csv"
    with fs.open(file_path_test, mode="rb") as file_in:
        dataset_test = pd.read_csv(file_in, sep=",")
    return dataset_test
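
A minimal usage sketch for these two helpers, assuming the s3fs filesystem object `fs` has already been created by the calling script and that the bucket holds a serialized model under the given names (the activity and model names below are hypothetical):

# Illustrative only: reload a persisted model and score the held-out test set.
model = load_model('musee', 'LogisticRegression_cv')     # hypothetical names
dataset_test = load_test_file('musee')
X_test = dataset_test.drop(columns=['y_has_purchased'])  # assumed target column
scores = model.predict_proba(X_test)[:, 1]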

358 utils_stat_desc.py Normal file
@ -0,0 +1,358 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns


def load_files(nb_compagnie):
    customer = pd.DataFrame()
    campaigns_brut = pd.DataFrame()
    campaigns_kpi = pd.DataFrame()
    products = pd.DataFrame()
    tickets = pd.DataFrame()

    # Loop over the companies and build one aggregated dataset per table
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_input_databases(directory_path, file_name="customerplus_cleaned")
        df_campaigns_brut = display_input_databases(directory_path, file_name="campaigns_information", datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_input_databases(directory_path, file_name="products_purchased_reduced", datetime_col=['purchase_date'])
        df_target_information = display_input_databases(directory_path, file_name="target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
        df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)

        # Create the number_company column, used later to aggregate results
        df_tickets_kpi["number_company"] = int(directory_path)
        df_campaigns_brut["number_company"] = int(directory_path)
        df_campaigns_kpi["number_company"] = int(directory_path)
        df_customerplus_clean["number_company"] = int(directory_path)
        df_target_information["number_company"] = int(directory_path)

        # Prefix customer ids with the company number so they stay unique across companies
        df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
        df_campaigns_brut["customer_id"] = directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
        df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
        df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
        df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

        # Concatenation
        customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
        campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
        campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
        tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
        products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)

    return customer, campaigns_kpi, campaigns_brut, tickets, products


def save_file_s3(File_name, type_of_activity):
    # Save the current matplotlib figure to the S3 bucket (relies on the global `fs`)
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()
def outlier_detection(tickets, company_list, show_diagram=False):
    outlier_list = list()

    for company in company_list:
        total_amount_share = tickets[tickets['number_company'] == int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
        total_amount_share['CA'] = total_amount_share['total_amount'].sum()
        total_amount_share['share_total_amount'] = total_amount_share['total_amount'] / total_amount_share['CA']

        total_amount_share_index = total_amount_share.set_index('customer_id')
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis=0, ascending=False)
        # Flag the top spender of each company as an outlier
        top = df_circulaire[:1]
        outlier_list.append(top.index[0])
        rest = df_circulaire[1:]
        rest_sum = rest.sum()

        new_series = pd.concat([top, pd.Series([rest_sum], index=['Other'])])

        if show_diagram:
            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Breakdown of total amounts for company {company}')
            plt.show()
    return outlier_list


def valid_customer_detection(products, campaigns_brut):
    # A customer is valid if they purchased or received an email after the starting date
    products_valid = products[products['purchase_date'] >= "2021-05-01"]
    consumer_valid_product = products_valid['customer_id'].to_list()

    campaigns_valid = campaigns_brut[campaigns_brut["sent_at"] >= "2021-05-01"]
    consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()

    consumer_valid = consumer_valid_product + consumer_valid_campaigns
    return consumer_valid


def identify_purchase_during_target_periode(products):
    products_target_period = products[(products['purchase_date'] >= "2022-11-01")
                                      & (products['purchase_date'] <= "2023-11-01")]
    customer_target_period = products_target_period['customer_id'].to_list()
    return customer_target_period


def remove_elements(lst, elements_to_remove):
    # Blank out an identifier when it is flagged as an outlier; the comparison
    # is made on the whole customer id, not character by character
    return '' if lst in elements_to_remove else lst


def compute_nb_clients(customer, type_of_activity):
    company_nb_clients = customer[customer["purchase_count"] > 0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"] / 1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()

    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Mailing consent rate for {type_of_activity}')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
    # Grouped barplot
    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw one set of bars per group
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0] * 100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Labels, legend, etc. (`type_of_activity` is a global here)
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Mailing consent by target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)


def gender_bar(customer, type_of_activity):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Stacked barplot
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label="Male")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom=company_genders["gender_male"], label="Female")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom=company_genders["gender_male"] + company_genders["gender_female"], label="Unknown")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of customers for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French Customers")
    plt.title(f"Share of French Customers for {type_of_activity}")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])

    plt.xlabel('Company')
    plt.ylabel("Average number of campaigns opened")
    plt.title(f"Average number of campaigns opened for {type_of_activity}")
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):

    campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))

    categories = campaigns_effectiveness["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw one set of bars per group
    for label in campaigns_effectiveness["has_purchased_target_period"].unique():
        label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0] * 100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Labels, legend, etc.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f"Share of customers who consented to receive mails for {type_of_activity}, by target")
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)


def sale_dynamics(products, campaigns_brut, type_of_activity):
    # Month of each customer's first purchase
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns={'purchase_date': 'first_purchase_event'}, inplace=True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first email received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns={'sent_at': 'first_email_reception'}, inplace=True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on='customer_id', how='outer')

    # Month from which the customer is considered known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis=1), utc=True, format='ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on=['customer_id'], how='inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month and per customer type
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns={'purchase_id': 'nb_purchases'}, inplace=True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns={'customer_id': 'nb_new_customer'}, inplace=True)

    # Plot the number of orders
    purchases_graph = nb_purchases_graph

    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021, 3, 1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"] == False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"] == True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New customers")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Existing customers")

    # Format the x-axis as abbreviated month + year
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)


def tickets_internet(tickets, type_of_activity):
    nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Purchases Bought Online")
    plt.title(f"Share of Purchases Bought Online for {type_of_activity}")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)


def already_bought_online(tickets, type_of_activity):
    nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet': 'sum',
                                                                  'customer_id': 'nunique'}).reset_index())
    nb_consumers_online["Share_consumers_internet"] = nb_consumers_online["achat_internet"] / nb_consumers_online["customer_id"]

    plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Customers who Bought Online at least once")
    plt.title(f"Share of Customers who Bought Online at least once for {type_of_activity}")
    plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
    plt.show()
    save_file_s3("First_buy_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    price_tickets = tickets[(tickets['total_amount'] > 0)]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of ticket prices for {type_of_activity}")
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)