# =============================================================================
# File: 0_4_Generate_stat_desc.py
#
# Interactive script: asks for a type of activity, loads and filters the
# datasets of the matching companies, then generates the descriptive charts.
# Each chart is uploaded to the bucket by the helpers of utils_stat_desc.py.
# =============================================================================
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Silence library warnings so the console output stays readable.
warnings.filterwarnings('ignore')


def _exec_local_script(path):
    """Run a sibling project script inside this module's global namespace.

    The helper files are executed (not imported) on purpose: their functions
    look up names such as ``fs`` in the globals of this script.

    NOTE(review): ``exec`` runs whatever the file contains; only use it on
    trusted project files.
    """
    # The context manager closes the handle that a bare open(...).read() leaks.
    with open(path, encoding="utf-8") as script:
        exec(compile(script.read(), path, "exec"), globals())


def _keep_relevant_customers(dataset, outliers, valid_customers):
    """Return the rows of *dataset* whose customer is valid and not an outlier."""
    customer_ids = dataset['customer_id']
    keep = customer_ids.isin(valid_customers) & ~customer_ids.isin(outliers)
    # Copy so that columns can later be added without chained-assignment issues.
    return dataset[keep].copy()


_exec_local_script('0_KPI_functions.py')
_exec_local_script('utils_stat_desc.py')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5'],
             'musique': ['10', '11', '12', '13', '14']}


type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
if type_of_activity not in companies:
    # Fail with a readable message instead of a bare KeyError.
    raise SystemExit(
        f"Unknown type of activity {type_of_activity!r}; "
        f"expected one of: {', '.join(companies)}"
    )
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify the anonymous customer of each company (its top spender)
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (customers who bought tickets or received mails
# after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

# Drop outliers and keep only valid customers in every dataset.
# The filtered frames are rebound to their names: filtering a loop variable
# would leave the original frames untouched.
_outlier_ids = set(outlier_list)
_valid_ids = set(customer_valid_list)
customer, campaigns_kpi, campaigns_brut, tickets, products = [
    _keep_relevant_customers(dataset, _outlier_ids, _valid_ids)
    for dataset in (customer, campaigns_kpi, campaigns_brut, tickets, products)
]

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(
    customer['customer_id'].isin(customer_target_period), 1, 0
)

# Generate the charts; each function uploads its figure to the bucket
compute_nb_clients(customer, type_of_activity)

maximum_price_paid(customer, type_of_activity)

mailing_consent(customer, type_of_activity)

mailing_consent_by_target(customer, type_of_activity)

gender_bar(customer, type_of_activity)

country_bar(customer, type_of_activity)

lazy_customer_plot(campaigns_kpi, type_of_activity)

# Disabled in the original script:
# campaigns_effectiveness(customer, type_of_activity)

sale_dynamics(products, campaigns_brut, type_of_activity)

tickets_internet(tickets, type_of_activity)

box_plot_price_tickets(tickets, type_of_activity)


# =============================================================================
# File: utils_stat_desc.py
#
# Loading, filtering and plotting helpers used by 0_4_Generate_stat_desc.py.
# These functions expect the following names to exist in the namespace that
# executes this file: ``fs`` (filesystem object used for uploads) and the KPI
# helpers ``display_databases``, ``campaigns_kpi_function``,
# ``tickets_kpi_function`` and ``customerplus_kpi_function``.
# =============================================================================
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns


def load_files(nb_compagnie):
    """Load and aggregate the datasets of several companies.

    Args:
        nb_compagnie: iterable of company identifiers given as strings
            (each one is passed to ``display_databases`` and to ``int``).

    Returns:
        Tuple ``(customer, campaigns_kpi, campaigns_brut, tickets, products)``
        of DataFrames concatenated over all companies. ``customer_id`` is
        prefixed with ``"<company>_"`` in every frame, and every frame except
        ``products`` gets a ``number_company`` column. Empty DataFrames are
        returned when *nb_compagnie* is empty.
    """
    collected = {
        "customer": [],
        "campaigns_kpi": [],
        "campaigns_brut": [],
        "tickets": [],
        "products": [],
    }

    # Build the per-company frames; they are concatenated once after the loop.
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name="customerplus_cleaned")
        df_campaigns_brut = display_databases(
            directory_path,
            file_name="campaigns_information",
            datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'],
        )
        df_products_purchased_reduced = display_databases(
            directory_path,
            file_name="products_purchased_reduced",
            datetime_col=['purchase_date'],
        )
        # NOTE(review): loaded as in the original code but never returned or
        # used afterwards; confirm whether this load can be dropped.
        display_databases(directory_path, file_name="target_information")

        # KPIs are computed before customer ids are prefixed.
        df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)

        # Company number column, used later to aggregate the results.
        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean):
            frame["number_company"] = int(directory_path)

        company_frames = {
            "customer": df_customerplus_clean,
            "campaigns_kpi": df_campaigns_kpi,
            "campaigns_brut": df_campaigns_brut,
            "tickets": df_tickets_kpi,
            "products": df_products_purchased_reduced,
        }
        for name, frame in company_frames.items():
            # Make customer ids unique across companies.
            frame["customer_id"] = directory_path + '_' + frame['customer_id'].astype('str')
            collected[name].append(frame)

    def _combine(parts):
        # pd.concat raises on an empty list, so keep the empty-input behavior.
        return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    return (
        _combine(collected["customer"]),
        _combine(collected["campaigns_kpi"]),
        _combine(collected["campaigns_brut"]),
        _combine(collected["tickets"]),
        _combine(collected["products"]),
    )


def save_file_s3(File_name, type_of_activity, close=True):
    """Upload the current matplotlib figure as a PNG through the global ``fs``.

    The object is written to
    ``projet-bdc2324-team1/stat_desc/<type_of_activity>/<File_name><type_of_activity>.png``.

    Args:
        File_name: prefix of the output file name.
        type_of_activity: activity label, used as folder and file-name suffix.
        close: when true (default, original behavior) the current figure is
            closed afterwards, even if the upload fails.
    """
    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png')
        FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
        FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        if close:
            plt.close()


def _publish_figure(file_prefix, type_of_activity):
    """Upload the current figure, display it, then close it.

    The figure is saved *before* ``plt.show()``: some backends release the
    figure once it has been shown, and saving afterwards then yields a blank
    image.
    """
    figure = plt.gcf()
    try:
        save_file_s3(file_prefix, type_of_activity, close=False)
        plt.show()
    finally:
        plt.close(figure)


def _company_bar_chart(table, heights, ylabel, title):
    """Draw one bar per row of *table*, positioned and labelled by company."""
    company_numbers = table["number_company"]
    plt.bar(company_numbers, heights)

    plt.xlabel('Company')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(company_numbers, ["{}".format(i) for i in company_numbers])


def outlier_detection(tickets, company_list, show_diagram=False):
    """Return, for each company, the customer with the largest total amount.

    Args:
        tickets: DataFrame with ``number_company``, ``customer_id`` and
            ``total_amount`` columns.
        company_list: iterable of company identifiers convertible to ``int``.
        show_diagram: when true, display a pie chart comparing the top
            customer with all the others for each company.

    Returns:
        List of customer ids, one per company that has at least one row in
        *tickets* (companies without rows are skipped).
    """
    outlier_list = []

    for company in company_list:
        company_tickets = tickets[tickets['number_company'] == int(company)]
        spending = (
            company_tickets.groupby('customer_id')['total_amount']
            .sum()
            .sort_values(ascending=False)
        )
        if spending.empty:
            # Nothing to flag for a company without tickets.
            continue

        top = spending.iloc[:1]
        outlier_list.append(top.index[0])

        if show_diagram:
            rest_sum = spending.iloc[1:].sum()
            new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
            plt.show()
    return outlier_list


def valid_customer_detection(products, campaigns_brut, start_date="2021-05-01"):
    """List the customers active on or after *start_date*.

    A customer is listed once per row of *products* whose ``purchase_date`` is
    on or after *start_date*, followed by once per row of *campaigns_brut*
    whose ``sent_at`` is on or after *start_date* (duplicates are kept).

    Args:
        products: DataFrame with ``purchase_date`` and ``customer_id``.
        campaigns_brut: DataFrame with ``sent_at`` and ``customer_id``.
        start_date: lower bound (inclusive); defaults to ``"2021-05-01"``.

    Returns:
        List of customer ids.
    """
    recent_purchases = products[products['purchase_date'] >= start_date]
    recent_mails = campaigns_brut[campaigns_brut["sent_at"] >= start_date]

    return recent_purchases['customer_id'].to_list() + recent_mails['customer_id'].to_list()


def identify_purchase_during_target_periode(products, start_date="2022-11-01", end_date="2023-11-01"):
    """List the customers who purchased between *start_date* and *end_date*.

    Both bounds are inclusive. One id is returned per matching row of
    *products* (duplicates are kept).
    """
    purchase_dates = products['purchase_date']
    in_period = (purchase_dates >= start_date) & (purchase_dates <= end_date)
    return products.loc[in_period, 'customer_id'].to_list()


def remove_elements(lst, elements_to_remove):
    """Concatenate the items of *lst* that are not in *elements_to_remove*.

    *lst* is iterated item by item: when it is a string, the test is applied
    to each character, not to the whole string.
    """
    return ''.join(item for item in lst if item not in elements_to_remove)


def compute_nb_clients(customer, type_of_activity):
    """Bar chart of the number of customers with a purchase, per company."""
    buyers = customer[customer["purchase_count"] > 0]
    company_nb_clients = buyers.groupby("number_company")["customer_id"].count().reset_index()

    _company_bar_chart(
        company_nb_clients,
        company_nb_clients["customer_id"] / 1000,
        ylabel="Number of clients (thousands)",
        title=f"Number of clients for {type_of_activity}",
    )
    _publish_figure("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    """Bar chart of the maximum of ``max_price`` per company."""
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()

    _company_bar_chart(
        company_max_price,
        company_max_price["max_price"],
        ylabel="Maximal price of a ticket Prix",
        title=f"Maximal price of a ticket for {type_of_activity}",
    )
    _publish_figure("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    """Bar chart of the mean of ``opt_in`` per company."""
    consent_by_company = customer.groupby("number_company")["opt_in"].mean().reset_index()

    _company_bar_chart(
        consent_by_company,
        consent_by_company["opt_in"],
        ylabel='Consent',
        title=f'Consent of mailing for {type_of_activity}',
    )
    _publish_figure("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer, type_of_activity=None):
    """Grouped bar chart of mean ``opt_in`` (x100) per company and target flag.

    Args:
        customer: DataFrame with ``number_company``,
            ``has_purchased_target_period`` and ``opt_in`` columns.
        type_of_activity: activity label used in the title and file name.
            When omitted, the global ``type_of_activity`` is used, as the
            original implementation did implicitly.

    Raises:
        ValueError: if *type_of_activity* is omitted and no such global exists.
    """
    if type_of_activity is None:
        try:
            type_of_activity = globals()["type_of_activity"]
        except KeyError:
            raise ValueError("type_of_activity must be provided") from None

    df_graph = (
        customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"]
        .mean()
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    base_positions = np.arange(len(categories))

    # One group of bars per value of the target flag, shifted by one bar width.
    for group_index, label in enumerate(df_graph["has_purchased_target_period"].unique()):
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        consent = label_data.set_index("number_company")["opt_in"]
        # A company without customers in this group gets a zero-height bar
        # instead of raising an IndexError.
        values = (consent.reindex(categories) * 100).fillna(0).to_numpy()

        label_printed = "purchased" if label else "no purchase"
        ax.bar(base_positions + group_index * bar_width, values, bar_width, label=label_printed)

    # Labels, legend and ticks centred between the two bars of each company.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in base_positions])
    ax.set_xticklabels(categories)
    ax.legend()

    _publish_figure("mailing_consent_target_", type_of_activity)


def gender_bar(customer, type_of_activity):
    """Stacked bar chart of the mean gender indicators per company."""
    gender_cols = ["gender_male", "gender_female", "gender_other"]
    company_genders = customer.groupby("number_company")[gender_cols].mean().reset_index()
    company_numbers = company_genders["number_company"]

    # Stack male, then female, then other on top of each other.
    plt.bar(company_numbers, company_genders["gender_male"], label="Homme")
    plt.bar(company_numbers, company_genders["gender_female"],
            bottom=company_genders["gender_male"], label="Femme")
    plt.bar(company_numbers, company_genders["gender_other"],
            bottom=company_genders["gender_male"] + company_genders["gender_female"], label="Inconnu")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of Customer for {type_of_activity}")
    plt.legend()
    plt.xticks(company_numbers, ["{}".format(i) for i in company_numbers])
    _publish_figure("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    """Bar chart of the mean of ``country_fr`` per company."""
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()

    _company_bar_chart(
        company_country_fr,
        company_country_fr["country_fr"],
        ylabel="Share of French Customer",
        title=f"Share of French Customer for {type_of_activity}",
    )
    _publish_figure("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    """Bar chart of the mean of ``nb_campaigns_opened`` per company.

    NOTE(review): the axis label and title speak of a share of customers who
    did not open mail, while the plotted value is a mean of
    ``nb_campaigns_opened`` — confirm which one is intended.
    """
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()

    _company_bar_chart(
        company_lazy_customers,
        company_lazy_customers["nb_campaigns_opened"],
        ylabel="Share of Customers who did not open mail",
        title=f"Share of Customers who did not open mail for {type_of_activity}",
    )
    _publish_figure("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):
    """Bar chart of the mean of ``opt_in`` per company.

    NOTE(review): the labels mention a number of customers while the plotted
    value is the mean of ``opt_in`` — confirm the intended metric.
    """
    effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()

    # No legend: the chart has a single, unlabelled series.
    _company_bar_chart(
        effectiveness,
        effectiveness["opt_in"],
        ylabel="Number of Customers (thousands)",
        title=f"Number of Customers of have bought or have received mails for {type_of_activity}",
    )
    _publish_figure("campaigns_effectiveness_", type_of_activity)


def _first_event_month(events, date_col, month_col):
    """Return ``customer_id`` and the month (first day) of its earliest *date_col*."""
    first_event = events.groupby('customer_id')[date_col].min().reset_index()
    first_dates = pd.to_datetime(first_event[date_col])
    first_event[month_col] = pd.to_datetime(first_dates.dt.strftime('%Y-%m'))
    return first_event[['customer_id', month_col]]


def sale_dynamics(products, campaigns_brut, type_of_activity):
    """Stacked monthly bar chart of purchases by new versus known customers.

    A customer becomes "known" at the month of its first purchase or first
    email, whichever comes first; a purchase counts as made by a known
    customer when it happens more than one month after that date. Only months
    from March 2021 onwards are plotted.

    Args:
        products: DataFrame with ``customer_id``, ``purchase_id`` and
            ``purchase_date`` columns.
        campaigns_brut: DataFrame with ``customer_id`` and ``sent_at``.
        type_of_activity: activity label used in the title and file name.
    """
    # Month of the first purchase and of the first email for each customer.
    first_purchase = _first_event_month(products, 'purchase_date', 'first_purchase_month')
    first_email = _first_event_month(campaigns_brut, 'sent_at', 'first_email_month')

    known_customer = pd.merge(first_purchase, first_email, on='customer_id', how='outer')

    # Month from which the customer is considered known.
    known_customer['known_date'] = pd.to_datetime(
        known_customer[['first_email_month', 'first_purchase_month']].min(axis=1),
        utc=True,
        format='ISO8601',
    )

    # One row per purchase, flagged by whether the customer was already known.
    purchases_count = pd.merge(
        products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(),
        known_customer[['customer_id', 'known_date']],
        on=['customer_id'],
        how='inner',
    )
    purchases_count['is_customer_known'] = (
        purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    )
    purchases_count['purchase_date_month'] = pd.to_datetime(
        purchases_count['purchase_date'].dt.strftime('%Y-%m')
    )
    # NOTE(review): ids built by load_files are strings, so this comparison
    # with the integer 1 presumably filters nothing — confirm the intent.
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of purchases per month and per type of customer.
    monthly_counts = (
        purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id']
        .count()
        .reset_index()
        .rename(columns={'purchase_id': 'nb_purchases'})
    )
    monthly_counts = monthly_counts[monthly_counts["purchase_date_month"] >= datetime(2021, 3, 1)]

    kept_cols = ["purchase_date_month", "nb_purchases"]
    new_customers = monthly_counts.loc[monthly_counts["is_customer_known"].eq(False), kept_cols]
    known_customers = monthly_counts.loc[monthly_counts["is_customer_known"].eq(True), kept_cols]

    # Outer merge: a month with only one kind of customer must stay on the
    # chart, with a count of 0 for the missing kind.
    merged_data = (
        pd.merge(new_customers, known_customers, on="purchase_date_month",
                 how="outer", suffixes=("_new", "_old"))
        .fillna(0)
        .sort_values("purchase_date_month")
    )

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")

    # Show abbreviated month and two-digit year on the x axis.
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    _publish_figure("sale_dynamics_", type_of_activity)


def tickets_internet(tickets, type_of_activity):
    """Bar chart of the percentage of tickets bought online, per company."""
    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
    nb_tickets_internet["Share_ticket_internet"] = (
        nb_tickets_internet["nb_tickets_internet"] * 100 / nb_tickets_internet["nb_tickets"]
    )

    _company_bar_chart(
        nb_tickets_internet,
        nb_tickets_internet["Share_ticket_internet"],
        ylabel="Share of Tickets Bought Online",
        title=f"Share of Tickets Bought Online for {type_of_activity}",
    )
    _publish_figure("tickets_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    """Box plot of strictly positive ``total_amount`` values, per company.

    Outliers are hidden and the mean is displayed. The x tick labels are left
    to seaborn, which labels each box with its company number; forcing ticks
    from the raw column would place one tick per data row at positions that
    do not match the boxes.
    """
    price_tickets = tickets[tickets['total_amount'] > 0]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of price tickets for {type_of_activity}")
    _publish_figure("box_plot_price_tickets_", type_of_activity)