diff --git a/Descriptive_statistics/generate_stat_desc.py b/Descriptive_statistics/generate_stat_desc.py
new file mode 100644
index 0000000..0f09598
--- /dev/null
+++ b/Descriptive_statistics/generate_stat_desc.py
@@ -0,0 +1,78 @@
+import pandas as pd
+import numpy as np
+import os
+import s3fs
+import re
+import warnings
+
+# Ignore warning
+warnings.filterwarnings('ignore')
+
+# Load the shared KPI helpers and the plotting functions into this namespace.
+# NOTE: exec() runs these project files verbatim, so they must stay trusted.
+# The "with" blocks make sure both file handles are closed after reading.
+with open('../0_KPI_functions.py', encoding='utf-8') as source_file:
+    exec(source_file.read())
+with open('plot.py', encoding='utf-8') as source_file:
+    exec(source_file.read())
+
+# Create filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
+companies = {'musee' : ['1', '2', '3', '4'], # , '101'
+             'sport': ['5'],
+             'musique' : ['10', '11', '12', '13', '14']}
+
+
+# Normalise the answer so that inputs such as "Sport " are accepted too
+type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip().lower()
+if type_of_activity not in companies:
+    raise SystemExit(f"Unknown type of activity {type_of_activity!r}; expected one of {sorted(companies)}")
+list_of_comp = companies[type_of_activity]
+
+# Load files
+customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)
+
+# Identify anonymous customer for each company and remove them from our datasets
+outlier_list = outlier_detection(tickets, list_of_comp)
+
+# Identify valid customer (customer who bought tickets after starting date)
+customer_valid_list = valid_customer_detection(products)
+
+# Sets give fast membership tests and ignore duplicated ids
+outlier_ids = set(outlier_list)
+valid_ids = set(customer_valid_list)
+
+
+def keep_valid_customers(dataset):
+    """Return the rows of ``dataset`` owned by a valid, non-outlier customer."""
+    customer_ids = dataset['customer_id']
+    keep = customer_ids.isin(valid_ids) & ~customer_ids.isin(outlier_ids)
+    # reset_index returns a fresh frame, so later column assignments are safe
+    return dataset[keep].reset_index(drop=True)
+
+
+# Filter the rows of every dataset; the customer_id values are left untouched
+databases = [keep_valid_customers(dataset)
+             for dataset in (customer, campaigns_kpi, campaigns_brut, tickets, products)]
+customer, campaigns_kpi, campaigns_brut, tickets, products = databases
+
+# Generate graph and automatically saved them in the bucket
+compute_nb_clients(customer, type_of_activity)
+
+maximum_price_paid(customer, type_of_activity)
+
+mailing_consent(customer, type_of_activity)
+
+gender_bar(customer, type_of_activity)
+
+country_bar(customer, type_of_activity)
+
+lazy_customer_plot(campaigns_kpi, type_of_activity)
+
+campaigns_effectiveness(customer, type_of_activity)
+
+sale_dynamics(products, campaigns_brut, type_of_activity)
+
+tickets_internet(tickets, type_of_activity)
diff --git a/Descriptive_statistics/plot.py b/Descriptive_statistics/plot.py
new file mode 100644
index 0000000..95ddf0e
--- /dev/null
+++ b/Descriptive_statistics/plot.py
@@ -0,0 +1,396 @@
+import pandas as pd
+import os
+import s3fs
+import warnings
+from datetime import date, timedelta, datetime
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.dates as mdates
+import seaborn as sns
+
+
+def _concat_frames(frames):
+    """Concatenate a list of DataFrames, or return an empty one for no input."""
+    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
+
+
+def load_files(nb_compagnie):
+    """Load and aggregate the datasets of several companies.
+
+    The raw files are read with ``display_databases`` and the KPI tables are
+    built with ``campaigns_kpi_function``, ``tickets_kpi_function`` and
+    ``customerplus_kpi_function``; these helpers are not defined in this file
+    and must already be in scope.
+
+    Parameters
+    ----------
+    nb_compagnie : iterable of str
+        Company identifiers. Each one is passed to ``display_databases``,
+        converted with ``int`` and used as prefix of the customer ids.
+
+    Returns
+    -------
+    tuple of pandas.DataFrame
+        ``(customer, campaigns_kpi, campaigns_brut, tickets, products)``,
+        each frame being the concatenation over all companies (empty frames
+        when ``nb_compagnie`` is empty).
+    """
+    customer_frames = []
+    campaigns_kpi_frames = []
+    campaigns_brut_frames = []
+    tickets_frames = []
+    products_frames = []
+
+    # Loop building the datasets aggregated over the requested companies
+    for directory_path in nb_compagnie:
+        df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
+        df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
+        df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+
+        df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
+        df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
+        df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
+
+        # Company number column, used later to aggregate the results
+        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean):
+            frame["number_company"] = int(directory_path)
+
+        # Prefix the ids with the company so that they stay unique once merged
+        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi,
+                      df_customerplus_clean, df_products_purchased_reduced):
+            frame["customer_id"] = directory_path + '_' + frame['customer_id'].astype('str')
+
+        customer_frames.append(df_customerplus_clean)
+        campaigns_kpi_frames.append(df_campaigns_kpi)
+        campaigns_brut_frames.append(df_campaigns_brut)
+        tickets_frames.append(df_tickets_kpi)
+        products_frames.append(df_products_purchased_reduced)
+
+    # A single concatenation per dataset avoids re-copying the data at each step
+    customer = _concat_frames(customer_frames)
+    campaigns_kpi = _concat_frames(campaigns_kpi_frames)
+    campaigns_brut = _concat_frames(campaigns_brut_frames)
+    tickets = _concat_frames(tickets_frames)
+    products = _concat_frames(products_frames)
+
+    return customer, campaigns_kpi, campaigns_brut, tickets, products
+
+
+def save_file_s3(File_name, type_of_activity):
+    """Save the current matplotlib figure as a PNG file in the S3 bucket.
+
+    The object is written to
+    ``projet-bdc2324-team1/stat_desc/<type_of_activity>/<File_name><type_of_activity>.png``
+    through the global ``fs`` filesystem, which the calling script must define.
+    """
+    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
+    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
+    with fs.open(FILE_PATH_OUT_S3, 'wb') as file_out:
+        plt.savefig(file_out, format='png')
+
+
+def _save_show_close(file_prefix, type_of_activity):
+    """Upload the current figure, display it, then release it."""
+    # Save before showing: once plt.show() returns the figure may be gone and
+    # savefig would write an empty canvas.
+    save_file_s3(file_prefix, type_of_activity)
+    plt.show()
+    plt.close()
+
+
+def outlier_detection(tickets, company_list, show_diagram=False):
+    """Return, for each company, the customer with the highest total amount.
+
+    Parameters
+    ----------
+    tickets : pandas.DataFrame
+        Must contain ``number_company``, ``customer_id`` and ``total_amount``.
+    company_list : iterable
+        Company identifiers, converted with ``int`` before filtering.
+    show_diagram : bool, default False
+        When True, draw a pie chart of the top customer against the others.
+
+    Returns
+    -------
+    list
+        One ``customer_id`` per company that has at least one ticket.
+    """
+    outlier_list = list()
+
+    for company in company_list:
+        company_tickets = tickets[tickets['number_company']==int(company)]
+        df_circulaire = company_tickets.groupby('customer_id')['total_amount'].sum().sort_values(ascending = False)
+        print('df circulaire : ', df_circulaire.head())
+        if df_circulaire.empty:
+            # No ticket for this company, hence no outlier to report
+            continue
+
+        top = df_circulaire.iloc[:1]
+        print('top : ', top)
+        outlier_list.append(top.index[0])
+
+        if show_diagram:
+            rest_sum = df_circulaire.iloc[1:].sum()
+            new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
+
+            plt.figure(figsize=(3, 3))
+            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
+            plt.axis('equal')
+            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
+            plt.show()
+    return outlier_list
+
+
+def valid_customer_detection(products):
+    """Return the ``customer_id`` of every purchase made from 2021-05-01 on.
+
+    The list has one entry per matching row, so an id may appear several times.
+    """
+    products_valid = products[products['purchase_date']>="2021-05-01"]
+    consumer_valid = products_valid['customer_id'].to_list()
+    return consumer_valid
+
+
+def remove_elements(lst, elements_to_remove):
+    """Join into one string the items of ``lst`` absent from ``elements_to_remove``.
+
+    When ``lst`` is itself a string, the test is made character by character.
+    """
+    return ''.join([x for x in lst if x not in elements_to_remove])
+
+
+def keep_elements(lst, elements_to_remove):
+    """Join into one string the items of ``lst`` present in ``elements_to_remove``.
+
+    Despite its name, ``elements_to_remove`` holds the items that are kept.
+    """
+    return ''.join([x for x in lst if x in elements_to_remove])
+
+
+def compute_nb_clients(customer, type_of_activity):
+    """Bar chart of the number of customers with a purchase, per company."""
+    company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
+
+    plt.figure()
+    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
+
+    plt.xlabel('Company')
+    plt.ylabel("Number of clients (thousands)")
+    plt.title(f"Number of clients for {type_of_activity}")
+
+    _save_show_close("nb_clients_", type_of_activity)
+
+
+def maximum_price_paid(customer, type_of_activity):
+    """Bar chart of the highest ``max_price`` observed in each company."""
+    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
+
+    plt.figure()
+    plt.bar(company_max_price["number_company"], company_max_price["max_price"])
+
+    plt.xlabel('Company')
+    plt.ylabel("Maximal price of a ticket Prix")
+    plt.title(f"Maximal price of a ticket for {type_of_activity}")
+
+    _save_show_close("Maximal_price_", type_of_activity)
+
+
+def mailing_consent(customer, type_of_activity):
+    """Grouped bar chart of the mean ``opt_in`` (in %) per company.
+
+    One bar is drawn for each value of ``already_purchased``, a column that
+    ``customer`` must already contain.
+    """
+    # One row per company and one column per value of already_purchased
+    consent = customer.groupby(["number_company", "already_purchased"])["opt_in"].mean().unstack("already_purchased") * 100
+    fig, ax = plt.subplots(figsize=(10, 6))
+
+    categories = consent.index
+    bar_width = 0.35
+    bar_positions = np.arange(len(categories))
+
+    for label, values in consent.items():
+        label_printed = "purchased" if label else "no purchase"
+        # A company without any customer in this group gets a zero-height bar
+        ax.bar(bar_positions, values.fillna(0), bar_width, label=label_printed)
+
+        bar_positions = bar_positions + bar_width
+
+    # Labels, legend, etc.
+    ax.set_xlabel('Company')
+    ax.set_ylabel('Consent of mailing (%)')
+    ax.set_title(f'Consent of mailing for {type_of_activity}')
+    ax.set_xticks(np.arange(len(categories)) + bar_width / 2)
+    ax.set_xticklabels(categories)
+    ax.legend()
+
+    _save_show_close("mailing_consent_", type_of_activity)
+
+
+def gender_bar(customer, type_of_activity):
+    """Stacked bar chart of the mean gender indicators per company."""
+    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
+
+    plt.figure()
+    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
+    plt.bar(company_genders["number_company"], company_genders["gender_female"],
+            bottom = company_genders["gender_male"], label = "Femme")
+    plt.bar(company_genders["number_company"], company_genders["gender_other"],
+            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")
+
+    plt.xlabel('Company')
+    plt.ylabel("Gender")
+    plt.title(f"Gender of Customer for {type_of_activity}")
+    plt.legend()
+
+    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
+
+    _save_show_close("gender_bar_", type_of_activity)
+
+
+def country_bar(customer, type_of_activity):
+    """Bar chart of the mean of ``country_fr`` per company."""
+    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
+
+    plt.figure()
+    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
+
+    plt.xlabel('Company')
+    plt.ylabel("Share of French Customer")
+    plt.title(f"Share of French Customer for {type_of_activity}")
+
+    _save_show_close("country_bar_", type_of_activity)
+
+
+def lazy_customer_plot(campaigns_kpi, type_of_activity):
+    """Bar chart of the mean of ``no_campaign_opened`` per company."""
+    company_lazy_customers = campaigns_kpi.groupby("number_company")["no_campaign_opened"].mean().reset_index()
+
+    plt.figure()
+    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["no_campaign_opened"])
+
+    plt.xlabel('Company')
+    plt.ylabel("Share of Customers who did not open mail")
+    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
+
+    _save_show_close("lazy_customer_", type_of_activity)
+
+
+def campaigns_effectiveness(customer, type_of_activity):
+    """Stacked bar chart of customers with and without purchase, per company.
+
+    Side effect: adds (or overwrites) the boolean ``already_purchased`` column
+    of ``customer``, set to ``purchase_count > 0``.
+    """
+    customer["already_purchased"] = customer["purchase_count"]>0
+
+    nb_customers_purchasing = customer[customer["already_purchased"]].groupby("number_company")["customer_id"].count()
+    nb_customers_no_purchase = customer[~customer["already_purchased"]].groupby("number_company")["customer_id"].count()
+
+    # Align both series on the same companies so that the bars stack correctly
+    # even when a company only appears in one of the two groups
+    company_numbers = nb_customers_purchasing.index.union(nb_customers_no_purchase.index)
+    nb_customers_purchasing = nb_customers_purchasing.reindex(company_numbers, fill_value=0)
+    nb_customers_no_purchase = nb_customers_no_purchase.reindex(company_numbers, fill_value=0)
+
+    plt.figure()
+    plt.bar(company_numbers, nb_customers_purchasing/1000, label = "has purchased")
+    plt.bar(company_numbers, nb_customers_no_purchase/1000,
+            bottom = nb_customers_purchasing/1000, label = "has not purchased")
+
+    plt.xlabel('Company')
+    plt.ylabel("Number of Customers (thousands)")
+    plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
+    plt.legend()
+
+    _save_show_close("campaigns_effectiveness_", type_of_activity)
+
+
+def sale_dynamics(products, campaigns_brut, type_of_activity):
+    """Stacked bar chart of the monthly purchases of new and known customers.
+
+    A customer becomes "known" at the month of the first purchase or of the
+    first e-mail sent, whichever comes first. A purchase is attributed to a
+    known customer when it happens more than one month after that date.
+    Only the months from March 2021 on are plotted.
+    """
+    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
+    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
+    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
+    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
+
+    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
+    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
+    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
+    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
+
+    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
+                              first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
+
+    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
+
+    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
+    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
+    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
+    # NOTE(review): the ids built by load_files are prefixed strings, so this
+    # filter only matches a raw integer id of 1 - confirm it is still needed
+    purchases_count = purchases_count[purchases_count['customer_id'] != 1]
+
+    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
+    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
+
+    purchases_graph_used = nb_purchases_graph[nb_purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
+    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
+    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
+
+    # Index both series by month and align them on the union of the months, so
+    # that a month missing from one group is drawn as zero instead of shifting
+    sales_new = purchases_graph_used_0.set_index("purchase_date_month")["nb_purchases"]
+    sales_known = purchases_graph_used_1.set_index("purchase_date_month")["nb_purchases"]
+    months = sales_new.index.union(sales_known.index)
+    sales_new = sales_new.reindex(months, fill_value=0)
+    sales_known = sales_known.reindex(months, fill_value=0)
+
+    plt.figure()
+    plt.bar(months, sales_new, width=12, label = "Nouveau client")
+    plt.bar(months, sales_known,
+            bottom = sales_new, width=12, label = "Ancien client")
+
+    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
+
+    plt.xlabel('Month')
+    plt.ylabel("Number of Sales")
+    plt.title(f"Number of Sales for {type_of_activity}")
+    plt.legend()
+
+    _save_show_close("sale_dynamics_", type_of_activity)
+
+
+def tickets_internet(tickets, type_of_activity):
+    """Bar chart of the share (in %) of tickets bought online, per company.
+
+    ``tickets`` must contain ``number_company``, ``nb_tickets`` and
+    ``nb_tickets_internet``.
+    """
+    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
+    nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]
+
+    plt.figure()
+    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])
+
+    plt.xlabel('Company')
+    plt.ylabel("Share of Tickets Bought Online")
+    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
+
+    _save_show_close("tickets_internet_", type_of_activity)
+
+
+def box_plot_price_tickets(tickets, type_of_activity):
+    """Box plot of the strictly positive ``total_amount`` values, per company."""
+    price_tickets = tickets[(tickets['total_amount'] > 0)]
+
+    plt.figure()
+    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
+    plt.title(f"Box plot of price tickets for {type_of_activity}")
+
+    _save_show_close("box_plot_price_tickets_", type_of_activity)