generalize statistics

This commit is contained in:
Alexis REVELLE 2024-03-14 18:35:03 +00:00
parent ac6a3b365f
commit d42e81449a
2 changed files with 348 additions and 0 deletions

View File

@ -0,0 +1,58 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Silence library warnings so the console output stays readable.
warnings.filterwarnings('ignore')

# The helper files are executed in this namespace rather than imported
# ('0_KPI_functions.py' starts with a digit, so it is not an importable
# module name). Context managers make sure the file handles are closed.
with open('../0_KPI_functions.py') as kpi_source:
    exec(kpi_source.read())
with open('plot.py') as plot_source:
    exec(plot_source.read())

# Create filesystem object (used as a global by save_file_s3)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Company directories grouped by type of activity
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
             'sport': ['5'],
             'musique' : ['10', '11', '12', '13', '14']}

type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify anonymous customer for each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customer (customer who bought tickets after starting date)
customer_valid_list = valid_customer_detection(products)

# Keep only the rows whose customer is valid and is not an outlier.
# The previous version replaced the 'customer_id' column with the boolean
# mask returned by isin() instead of filtering rows, and stripped outliers
# character by character, which never matched a full customer id.
valid_ids = set(customer_valid_list) - set(outlier_list)
databases = [
    dataset[dataset['customer_id'].isin(valid_ids)]
    for dataset in (customer, campaigns_kpi, campaigns_brut, tickets, products)
]
customer, campaigns_kpi, campaigns_brut, tickets, products = databases

# Generate graph and automatically saved them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)

View File

@ -0,0 +1,290 @@
import pandas as pd
import os
import s3fs
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
    """Load and aggregate the datasets of several companies.

    For every company directory the raw tables are read with
    ``display_databases``, the KPI tables are derived from them, each table
    is tagged with the company number and every ``customer_id`` is prefixed
    with the company directory so ids stay unique across companies.

    Args:
        nb_compagnie: iterable of company directory names given as strings
            that can be converted with ``int`` (e.g. ``['1', '2']``).

    Returns:
        A tuple ``(customer, campaigns_kpi, campaigns_brut, tickets,
        products)`` of DataFrames. Each one is empty when ``nb_compagnie``
        is empty.
    """
    customer_parts = []
    campaigns_kpi_parts = []
    campaigns_brut_parts = []
    tickets_parts = []
    products_parts = []

    # Loop building the per-company tables that are aggregated afterwards
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
        df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
        # NOTE(review): this table is loaded and tagged but never returned.
        df_target_information = display_databases(directory_path, file_name = "target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

        # Company number column, used later to aggregate the results
        company_number = int(directory_path)
        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi,
                      df_customerplus_clean, df_target_information):
            frame["number_company"] = company_number

        # Make customer ids unique across companies: '<company>_<id>'
        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi,
                      df_customerplus_clean, df_products_purchased_reduced):
            frame["customer_id"] = directory_path + '_' + frame['customer_id'].astype('str')

        customer_parts.append(df_customerplus_clean)
        campaigns_kpi_parts.append(df_campaigns_kpi)
        campaigns_brut_parts.append(df_campaigns_brut)
        tickets_parts.append(df_tickets_kpi)
        products_parts.append(df_products_purchased_reduced)

    def _stack(parts):
        # Concatenate once at the end instead of growing a frame inside the
        # loop; pd.concat rejects an empty list, so fall back to an empty frame.
        return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    customer = _stack(customer_parts)
    campaigns_kpi = _stack(campaigns_kpi_parts)
    campaigns_brut = _stack(campaigns_brut_parts)
    tickets = _stack(tickets_parts)
    products = _stack(products_parts)

    return customer, campaigns_kpi, campaigns_brut, tickets, products
def save_file_s3(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG file in the S3 bucket.

    The object is written to
    ``projet-bdc2324-team1/stat_desc/<type_of_activity>/<File_name><type_of_activity>.png``.
    Relies on a module-level ``fs`` filesystem object being defined by the
    calling script.

    Args:
        File_name: prefix of the output file name (e.g. ``"nb_clients_"``).
        type_of_activity: activity label, used both as sub-directory and as
            file-name suffix.
    """
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    # The dot was missing, which produced names such as 'nb_clients_sportpng'.
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as file_out:
        # Writing to a file object gives matplotlib no extension to infer
        # the format from, so state it explicitly.
        plt.savefig(file_out, format='png')
def outlier_detection(tickets, company_list, show_diagram=False):
    """Return, for each company, the customer with the largest total amount.

    Args:
        tickets: DataFrame with the columns ``number_company``,
            ``customer_id`` and ``total_amount``.
        company_list: iterable of company identifiers convertible with
            ``int``.
        show_diagram: when True, display for each company a pie chart of the
            top customer's amount against the sum of all the others.

    Returns:
        A list with one ``customer_id`` per company that has at least one
        row in ``tickets``; companies without any row are skipped.
    """
    outlier_list = []

    for company in company_list:
        company_tickets = tickets[tickets['number_company'] == int(company)]
        amount_per_customer = (
            company_tickets.groupby('customer_id')['total_amount']
            .sum()
            .sort_values(ascending=False)
        )

        # A company without any ticket has no outlier; indexing the empty
        # result would raise an IndexError.
        if amount_per_customer.empty:
            continue

        top = amount_per_customer.iloc[:1]
        outlier_list.append(top.index[0])

        if show_diagram:
            rest_sum = amount_per_customer.iloc[1:].sum()
            new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
            plt.show()

    return outlier_list
def valid_customer_detection(products, start_date="2021-05-01"):
    """List the customers of every purchase made on or after ``start_date``.

    Args:
        products: DataFrame with the columns ``purchase_date`` and
            ``customer_id``.
        start_date: earliest purchase date kept (inclusive). Defaults to
            ``"2021-05-01"``, the value that used to be hard-coded.

    Returns:
        A list of ``customer_id`` values, one entry per matching purchase
        row (a customer with several purchases appears several times).
    """
    products_valid = products[products['purchase_date'] >= start_date]
    consumer_valid = products_valid['customer_id'].to_list()
    return consumer_valid
def remove_elements(lst, elements_to_remove):
    """Join the items of ``lst`` that are not in ``elements_to_remove``.

    ``lst`` is iterated item by item (character by character when it is a
    string) and the surviving items are concatenated into a single string.
    """
    survivors = []
    for item in lst:
        if item in elements_to_remove:
            continue
        survivors.append(item)
    return ''.join(survivors)
def keep_elements(lst, elements_to_remove):
    """Join the items of ``lst`` that are in ``elements_to_remove``.

    Despite its name, ``elements_to_remove`` holds the items to keep here.
    ``lst`` is iterated item by item (character by character when it is a
    string) and the matching items are concatenated into a single string.
    """
    retained = []
    for item in lst:
        if item in elements_to_remove:
            retained.append(item)
    return ''.join(retained)
def compute_nb_clients(customer, type_of_activity):
    """Plot, per company, the number of customers with at least one purchase.

    Args:
        customer: DataFrame with the columns ``purchase_count``,
            ``number_company`` and ``customer_id``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("nb_clients_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def maximum_price_paid(customer, type_of_activity):
    """Plot, per company, the highest ``max_price`` among its customers.

    Args:
        customer: DataFrame with the columns ``number_company`` and
            ``max_price``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket Prix")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("Maximal_price_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def mailing_consent(customer, type_of_activity):
    """Plot the mean mailing opt-in rate per company, split by purchase status.

    Args:
        customer: DataFrame with the columns ``number_company`` and
            ``opt_in``, plus either ``already_purchased`` or
            ``purchase_count``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    # 'already_purchased' is otherwise only created by
    # campaigns_effectiveness; derive it with the same definition when it is
    # missing, without modifying the caller's DataFrame.
    if "already_purchased" not in customer.columns:
        customer = customer.assign(already_purchased=customer["purchase_count"] > 0)

    df_graph = customer.groupby(["number_company", "already_purchased"])["opt_in"].mean().reset_index()

    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    base_positions = np.arange(len(categories))

    for rank, label in enumerate(df_graph["already_purchased"].unique()):
        label_data = df_graph[df_graph['already_purchased'] == label]
        opt_in_by_company = label_data.set_index('number_company')['opt_in']
        # A company may have no customer in this group: align on the full
        # company list and draw a zero-height bar instead of failing.
        values = opt_in_by_company.reindex(categories).fillna(0).to_numpy() * 100

        label_printed = "purchased" if label else "no purchase"
        # Each group is shifted by one bar width to sit beside the previous one
        ax.bar(base_positions + rank * bar_width, values, bar_width, label=label_printed)

    # Labels, ticks and legend
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent of mailing (%)')
    ax.set_title(f'Consent of mailing for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("mailing_consent_", type_of_activity)
    plt.show()
    plt.close(fig)
def gender_bar(customer, type_of_activity):
    """Plot, per company, the stacked mean shares of each gender column.

    Args:
        customer: DataFrame with the columns ``number_company``,
            ``gender_male``, ``gender_female`` and ``gender_other``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Stacked bars: each series starts where the previous ones end
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom = company_genders["gender_male"], label = "Femme")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of Customer for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("gender_bar_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def country_bar(customer, type_of_activity):
    """Plot, per company, the mean of the ``country_fr`` column.

    Args:
        customer: DataFrame with the columns ``number_company`` and
            ``country_fr``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French Customer")
    plt.title(f"Share of French Customer for {type_of_activity}")

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("country_bar_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def lazy_customer_plot(campaigns_kpi, type_of_activity):
    """Plot, per company, the mean of the ``no_campaign_opened`` column.

    Args:
        campaigns_kpi: DataFrame with the columns ``number_company`` and
            ``no_campaign_opened``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    company_lazy_customers = campaigns_kpi.groupby("number_company")["no_campaign_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["no_campaign_opened"])

    plt.xlabel('Company')
    plt.ylabel("Share of Customers who did not open mail")
    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("lazy_customer_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def campaigns_effectiveness(customer, type_of_activity):
    """Plot, per company, the stacked counts of buyers and non-buyers.

    Adds (or overwrites) the boolean column ``already_purchased`` on
    ``customer``, true when ``purchase_count`` is positive.

    Args:
        customer: DataFrame with the columns ``purchase_count``,
            ``number_company`` and ``customer_id``. Modified in place.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    customer["already_purchased"] = customer["purchase_count"]>0

    # The previous version read from an undefined 'customer_sport' variable.
    nb_customers_purchasing = customer[customer["already_purchased"]].groupby("number_company")["customer_id"].count()
    nb_customers_no_purchase = customer[~customer["already_purchased"]].groupby("number_company")["customer_id"].count()

    # Align both series on the same companies so the stacked bars match even
    # when a company has no customer in one of the two groups.
    company_numbers = nb_customers_purchasing.index.union(nb_customers_no_purchase.index)
    purchasing_thousands = nb_customers_purchasing.reindex(company_numbers, fill_value=0) / 1000
    no_purchase_thousands = nb_customers_no_purchase.reindex(company_numbers, fill_value=0) / 1000

    plt.bar(company_numbers, purchasing_thousands, label = "has purchased")
    plt.bar(company_numbers, no_purchase_thousands,
            bottom = purchasing_thousands, label = "has not purchased")

    plt.xlabel('Company')
    plt.ylabel("Number of Customers (thousands)")
    plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
    plt.legend()

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("campaigns_effectiveness_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def sale_dynamics(products, campaigns_brut, type_of_activity):
    """Plot monthly purchase counts, split between new and known customers.

    A customer's "known date" is the earlier of the month of their first
    purchase and the month of the first e-mail sent to them. A purchase made
    more than one month after that date is counted as coming from a known
    customer ("Ancien client"), otherwise from a new one ("Nouveau client").
    Only months from March 2021 onwards are plotted.

    Args:
        products: DataFrame with the columns ``customer_id``,
            ``purchase_id`` and ``purchase_date``.
        campaigns_brut: DataFrame with the columns ``customer_id`` and
            ``sent_at``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    # Month of the first purchase of each customer
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first e-mail sent to each customer
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Earliest of the two months; the outer merge keeps customers that
    # appear in only one of the two tables
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')

    # One row per distinct purchase, flagged as known / new customer
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    # NOTE(review): with the string ids built by load_files this comparison
    # to the integer 1 excludes nothing - confirm whether it is still needed.
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of purchases per month and customer status
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)

    purchases_graph_used = nb_purchases_graph[nb_purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
    new_customer_sales = purchases_graph_used[~purchases_graph_used["is_customer_known"]].set_index("purchase_date_month")["nb_purchases"]
    known_customer_sales = purchases_graph_used[purchases_graph_used["is_customer_known"]].set_index("purchase_date_month")["nb_purchases"]

    # Align both series on the same months: the two groups do not
    # necessarily cover the same months, and stacking them position by
    # position would pair values from different months.
    months = new_customer_sales.index.union(known_customer_sales.index)
    new_customer_sales = new_customer_sales.reindex(months, fill_value=0)
    known_customer_sales = known_customer_sales.reindex(months, fill_value=0)

    plt.bar(months, new_customer_sales, width=12, label = "Nouveau client")
    plt.bar(months, known_customer_sales,
            bottom = new_customer_sales, width=12, label = "Ancien client")

    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("sale_dynamics_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def tickets_internet(tickets, type_of_activity):
    """Plot, per company, the percentage of tickets bought online.

    Args:
        tickets: DataFrame with the columns ``number_company``,
            ``nb_tickets`` and ``nb_tickets_internet``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    # Use the 'tickets' argument: the previous version read from an
    # undefined 'products_purchased_reduced_spectacle' variable.
    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
    nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Tickets Bought Online")
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("tickets_internet_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()
def box_plot_price_tickets(tickets, type_of_activity):
    """Draw, per company, a box plot of the strictly positive total amounts.

    Outliers are hidden and the mean is marked on each box.

    Args:
        tickets: DataFrame with the columns ``total_amount`` and
            ``number_company``.
        type_of_activity: activity label used in the title and in the name
            of the saved file.
    """
    price_tickets = tickets[(tickets['total_amount'] > 0)]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of price tickets for {type_of_activity}")

    # Save before showing: once plt.show() has released the figure,
    # savefig would write an empty image.
    save_file_s3("box_plot_price_tickets_", type_of_activity)
    plt.show()
    # Close the figure so the next plot does not draw on top of this one.
    plt.close()