add function description
This commit is contained in:
parent
09f4bd3fe4
commit
df4c28bdd8
15
utils_ml.py
15
utils_ml.py
|
@ -49,6 +49,9 @@ def load_train_test(type_of_activity, type_of_model):
|
||||||
|
|
||||||
|
|
||||||
def save_file_s3(File_name, type_of_activity, type_of_model, model):
|
def save_file_s3(File_name, type_of_activity, type_of_model, model):
|
||||||
|
"""
|
||||||
|
save the current plot to S3 storage as a PNG image
|
||||||
|
"""
|
||||||
image_buffer = io.BytesIO()
|
image_buffer = io.BytesIO()
|
||||||
plt.savefig(image_buffer, format='png')
|
plt.savefig(image_buffer, format='png')
|
||||||
image_buffer.seek(0)
|
image_buffer.seek(0)
|
||||||
|
@ -60,6 +63,9 @@ def save_file_s3(File_name, type_of_activity, type_of_model, model):
|
||||||
|
|
||||||
|
|
||||||
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
|
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
|
||||||
|
"""
|
||||||
|
save the result set to S3 storage as a CSV file
|
||||||
|
"""
|
||||||
if model_path:
|
if model_path:
|
||||||
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
|
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
|
||||||
else:
|
else:
|
||||||
|
@ -69,6 +75,9 @@ def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, m
|
||||||
|
|
||||||
|
|
||||||
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
|
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
|
||||||
|
"""
|
||||||
|
save the pickled model to S3 storage
|
||||||
|
"""
|
||||||
model_bytes = pickle.dumps(classifier)
|
model_bytes = pickle.dumps(classifier)
|
||||||
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
|
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
|
||||||
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
|
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
|
||||||
|
@ -88,6 +97,9 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
|
||||||
|
|
||||||
|
|
||||||
def features_target_split(dataset_train, dataset_test):
|
def features_target_split(dataset_train, dataset_test):
|
||||||
|
"""
|
||||||
|
return train and test set
|
||||||
|
"""
|
||||||
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
|
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
|
||||||
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
|
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
|
||||||
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
|
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
|
||||||
|
@ -105,6 +117,9 @@ def features_target_split(dataset_train, dataset_test):
|
||||||
|
|
||||||
|
|
||||||
def preprocess(type_of_model, type_of_activity):
|
def preprocess(type_of_model, type_of_activity):
|
||||||
|
"""
|
||||||
|
preprocess variables before running machine learning pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
|
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
|
||||||
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
|
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
|
|
||||||
|
|
||||||
def load_files(nb_compagnie):
|
def load_files(nb_compagnie):
|
||||||
|
"""
|
||||||
|
load and preprocess dataframes
|
||||||
|
"""
|
||||||
|
|
||||||
customer = pd.DataFrame()
|
customer = pd.DataFrame()
|
||||||
campaigns_brut = pd.DataFrame()
|
campaigns_brut = pd.DataFrame()
|
||||||
campaigns_kpi = pd.DataFrame()
|
campaigns_kpi = pd.DataFrame()
|
||||||
|
@ -8,7 +12,6 @@ def load_files(nb_compagnie):
|
||||||
tickets = pd.DataFrame()
|
tickets = pd.DataFrame()
|
||||||
targets = pd.DataFrame()
|
targets = pd.DataFrame()
|
||||||
|
|
||||||
# début de la boucle permettant de générer des datasets agrégés pour les 5 compagnies de spectacle
|
|
||||||
for directory_path in nb_compagnie:
|
for directory_path in nb_compagnie:
|
||||||
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
|
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
|
||||||
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
|
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
|
||||||
|
@ -26,7 +29,7 @@ def load_files(nb_compagnie):
|
||||||
targets_columns.remove('customer_id')
|
targets_columns.remove('customer_id')
|
||||||
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
|
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
|
||||||
|
|
||||||
# creation de la colonne Number compagnie, qui permettra d'agréger les résultats
|
# Create company identifier
|
||||||
df_tickets_kpi["number_company"]=int(directory_path)
|
df_tickets_kpi["number_company"]=int(directory_path)
|
||||||
df_campaigns_brut["number_company"]=int(directory_path)
|
df_campaigns_brut["number_company"]=int(directory_path)
|
||||||
df_campaigns_kpi["number_company"]=int(directory_path)
|
df_campaigns_kpi["number_company"]=int(directory_path)
|
||||||
|
@ -34,7 +37,7 @@ def load_files(nb_compagnie):
|
||||||
df_target_information["number_company"]=int(directory_path)
|
df_target_information["number_company"]=int(directory_path)
|
||||||
df_target_KPI["number_company"]=int(directory_path)
|
df_target_KPI["number_company"]=int(directory_path)
|
||||||
|
|
||||||
# Traitement des index
|
# Clean index
|
||||||
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
|
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
|
||||||
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
|
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
|
||||||
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
|
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
|
||||||
|
@ -51,7 +54,7 @@ def load_files(nb_compagnie):
|
||||||
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
|
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
|
||||||
|
|
||||||
|
|
||||||
# Concaténation
|
# Concatenation
|
||||||
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
|
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
|
||||||
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
|
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
|
||||||
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
|
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
|
||||||
|
@ -73,6 +76,9 @@ def remove_outlier_total_amount(tickets):
|
||||||
|
|
||||||
|
|
||||||
def save_file_s3(File_name, type_of_activity):
|
def save_file_s3(File_name, type_of_activity):
|
||||||
|
"""
|
||||||
|
save the current plot to S3 storage
|
||||||
|
"""
|
||||||
image_buffer = io.BytesIO()
|
image_buffer = io.BytesIO()
|
||||||
plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight", dpi = 150)
|
plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight", dpi = 150)
|
||||||
image_buffer.seek(0)
|
image_buffer.seek(0)
|
||||||
|
@ -84,7 +90,9 @@ def save_file_s3(File_name, type_of_activity):
|
||||||
|
|
||||||
|
|
||||||
def outlier_detection(tickets, company_list, show_diagram=False):
|
def outlier_detection(tickets, company_list, show_diagram=False):
|
||||||
|
"""
|
||||||
|
detect anonymous customers
|
||||||
|
"""
|
||||||
outlier_list = list()
|
outlier_list = list()
|
||||||
|
|
||||||
for company in company_list:
|
for company in company_list:
|
||||||
|
@ -114,6 +122,9 @@ def outlier_detection(tickets, company_list, show_diagram=False):
|
||||||
|
|
||||||
|
|
||||||
def valid_customer_detection(products, campaigns_brut):
|
def valid_customer_detection(products, campaigns_brut):
|
||||||
|
"""
|
||||||
|
identify customers that fall within our time window
|
||||||
|
"""
|
||||||
products_valid = products[products['purchase_date']>="2021-05-01"]
|
products_valid = products[products['purchase_date']>="2021-05-01"]
|
||||||
consumer_valid_product = products_valid['customer_id'].to_list()
|
consumer_valid_product = products_valid['customer_id'].to_list()
|
||||||
|
|
||||||
|
@ -125,6 +136,9 @@ def valid_customer_detection(products, campaigns_brut):
|
||||||
|
|
||||||
|
|
||||||
def identify_purchase_during_target_periode(products):
|
def identify_purchase_during_target_periode(products):
|
||||||
|
"""
|
||||||
|
identify customers who purchased tickets during the target period
|
||||||
|
"""
|
||||||
products_target_period = products[(products['purchase_date']>="2022-11-01")
|
products_target_period = products[(products['purchase_date']>="2022-11-01")
|
||||||
& (products['purchase_date']<="2023-11-01")]
|
& (products['purchase_date']<="2023-11-01")]
|
||||||
customer_target_period = products_target_period['customer_id'].to_list()
|
customer_target_period = products_target_period['customer_id'].to_list()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user