BDC-team-1/utils_stat_desc.py

436 lines
21 KiB
Python
Raw Permalink Normal View History

2024-03-19 12:46:04 +01:00
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
2024-03-27 19:39:54 +01:00
targets = pd.DataFrame()
2024-03-19 12:46:04 +01:00
# début de la boucle permettant de générer des datasets agrégés pour les 5 compagnies de spectacle
for directory_path in nb_compagnie:
2024-03-19 14:54:31 +01:00
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
2024-03-19 12:46:04 +01:00
2024-03-20 09:33:56 +01:00
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
2024-03-19 12:46:04 +01:00
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
2024-03-27 19:39:54 +01:00
df_target_KPI = targets_KPI(df_target = df_target_information)
# Merge and
df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
targets_columns = list(df_target_KPI.columns)
targets_columns.remove('customer_id')
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
2024-03-19 12:46:04 +01:00
# creation de la colonne Number compagnie, qui permettra d'agréger les résultats
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
2024-03-27 19:39:54 +01:00
df_target_KPI["number_company"]=int(directory_path)
2024-03-19 12:46:04 +01:00
# Traitement des index
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
2024-03-21 11:47:40 +01:00
# Remove companies' outliers
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
# harmonize set of customers across databases
customer_id = df_tickets_kpi['customer_id'].to_list()
for dataset in [df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean, df_target_information]:
dataset = dataset[dataset['customer_id'].isin(customer_id)]
2024-03-27 19:39:54 +01:00
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
2024-03-19 12:46:04 +01:00
# Concaténation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
2024-03-27 19:39:54 +01:00
targets = pd.concat([targets, df_target_KPI], ignore_index=True)
2024-03-19 12:46:04 +01:00
2024-03-27 19:39:54 +01:00
return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
2024-03-19 12:46:04 +01:00
2024-03-21 11:47:40 +01:00
def remove_outlier_total_amount(tickets):
Q1 = tickets['total_amount'].quantile(0.25)
Q3 = tickets['total_amount'].quantile(0.75)
IQR = Q3 - Q1
upper = Q3 +1.5*IQR
outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
tickets = tickets[~tickets['customer_id'].isin(outliers)]
return tickets
2024-03-19 12:46:04 +01:00
def save_file_s3(File_name, type_of_activity):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
2024-03-29 11:14:14 +01:00
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_0_Descriptive_Statistics/{type_of_activity}/"
2024-03-19 12:46:04 +01:00
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets, company_list, show_diagram=False):
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
plt.title(f'Répartition des montants totaux pour la compagnie {company}')
plt.show()
return outlier_list
def valid_customer_detection(products, campaigns_brut):
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products):
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
return ''.join([x for x in lst if x not in elements_to_remove])
def compute_nb_clients(customer, type_of_activity):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company')
plt.ylabel("Number of clients (thousands)")
2024-03-21 09:16:29 +01:00
plt.title(f"Number of clients Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer, type_of_activity):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
2024-03-19 12:46:04 +01:00
plt.ylabel("Maximal price of a ticket Prix")
2024-03-21 09:16:29 +01:00
plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
2024-03-21 11:47:40 +01:00
def target_proportion(customer, type_of_activity):
df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
'customer_id' : 'nunique'}).reset_index()
df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
plt.xlabel('Company Number')
plt.ylabel('Share (%)')
plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
plt.show()
save_file_s3("share_target_", type_of_activity)
2024-03-19 12:46:04 +01:00
def mailing_consent(customer, type_of_activity):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
2024-03-21 09:16:29 +01:00
mailing_consent["opt_in"] *= 100
2024-03-19 12:46:04 +01:00
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
plt.ylabel('Mailing Consent (%)')
plt.title(f'Consent of mailing Across {type_of_activity} Companies')
2024-03-19 12:46:04 +01:00
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Création du barplot groupé
fig, ax = plt.subplots(figsize=(10, 6))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Grouper les données par label et créer les barres groupées
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
2024-03-21 09:16:29 +01:00
label_printed = "Purchase" if label else "No purchase"
2024-03-19 12:46:04 +01:00
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Mise à jour des positions des barres pour le prochain groupe
bar_positions = [pos + bar_width for pos in bar_positions]
# Ajout des étiquettes, de la légende, etc.
2024-03-21 09:16:29 +01:00
ax.set_xlabel('Company Number')
ax.set_ylabel('Mailing Consent (%)')
ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies')
2024-03-19 12:46:04 +01:00
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Affichage du plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer, type_of_activity):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
2024-03-21 09:16:29 +01:00
company_genders["gender_male"] *= 100
company_genders["gender_female"] *= 100
company_genders["gender_other"] *= 100
2024-03-19 12:46:04 +01:00
# Création du barplot
2024-03-21 09:16:29 +01:00
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
2024-03-19 12:46:04 +01:00
plt.bar(company_genders["number_company"], company_genders["gender_female"],
2024-03-21 09:16:29 +01:00
bottom = company_genders["gender_male"], label = "Female")
2024-03-19 12:46:04 +01:00
plt.bar(company_genders["number_company"], company_genders["gender_other"],
2024-03-21 09:16:29 +01:00
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
2024-03-19 12:46:04 +01:00
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
plt.ylabel("Frequency (%)")
plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer, type_of_activity):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
2024-03-21 09:16:29 +01:00
company_country_fr["country_fr"] *= 100
2024-03-19 12:46:04 +01:00
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
plt.ylabel("Share of French Customer (%)")
plt.title(f"Share of French Customer Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi, type_of_activity):
company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer, type_of_activity):
2024-03-20 09:33:56 +01:00
campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
2024-03-19 12:46:04 +01:00
2024-03-20 09:33:56 +01:00
fig, ax = plt.subplots(figsize=(10, 6))
2024-03-19 12:46:04 +01:00
2024-03-20 09:33:56 +01:00
categories = campaigns_effectiveness["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Grouper les données par label et créer les barres groupées
for label in campaigns_effectiveness["has_purchased_target_period"].unique():
label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
2024-03-21 09:16:29 +01:00
label_printed = "Purchase" if label else "No purchase"
2024-03-20 09:33:56 +01:00
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Mise à jour des positions des barres pour le prochain groupe
bar_positions = [pos + bar_width for pos in bar_positions]
# Ajout des étiquettes, de la légende, etc.
2024-03-21 09:16:29 +01:00
ax.set_xlabel('Company Number')
ax.set_ylabel('Share of Consent (%)')
ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
2024-03-20 09:33:56 +01:00
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
2024-03-19 12:46:04 +01:00
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products, campaigns_brut, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Mois du premier mails
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Fusion
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Mois à partir duquel le client est considere comme connu
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Nombre de commande par mois
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Nombre de commande par mois par type de client
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Graphique en nombre de commande
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
2024-03-21 09:16:29 +01:00
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
2024-03-19 12:46:04 +01:00
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
2024-03-21 09:16:29 +01:00
bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
2024-03-19 12:46:04 +01:00
# commande pr afficher slt
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
2024-03-21 09:16:29 +01:00
plt.title(f"Number of Sales Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets, type_of_activity):
2024-03-20 09:33:56 +01:00
nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
2024-03-21 09:16:29 +01:00
nb_tickets_internet['prop_purchases_internet'] *=100
2024-03-20 09:33:56 +01:00
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
2024-03-19 12:46:04 +01:00
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
plt.ylabel("Share of Purchases Bought Online (%)")
plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
2024-03-20 10:27:03 +01:00
def already_bought_online(tickets, type_of_activity):
nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
'customer_id' : 'nunique'}
).reset_index())
2024-03-21 09:16:29 +01:00
nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
2024-03-20 10:27:03 +01:00
plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
2024-03-21 09:16:29 +01:00
plt.xlabel('Company Number')
plt.ylabel("Share of Customer who Bought Online at least once (%)")
plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
2024-03-20 10:27:03 +01:00
plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
plt.show()
save_file_s3("First_buy_internet_", type_of_activity)
2024-03-19 12:46:04 +01:00
def box_plot_price_tickets(tickets, type_of_activity):
price_tickets = tickets[(tickets['total_amount'] > 0)]
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
2024-03-21 09:16:29 +01:00
plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
2024-03-19 12:46:04 +01:00
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)
2024-03-27 19:39:54 +01:00
def target_description(targets, type_of_activity):
describe_target = targets.groupby('number_company').agg(
prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))
plot = describe_target.plot.bar()
# Adding a title
plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")
# Adding labels for x and y axes
plot.set_xlabel("Company Number")
plot.set_ylabel("Target Proportion")
plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')
# Adding a legend
plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")
save_file_s3("target_category_proportion_", type_of_activity)