import pandas as pd import os import s3fs import io import warnings from datetime import date, timedelta, datetime import numpy as np import matplotlib.pyplot as plt import matplotlib.dates as mdates import seaborn as sns def load_files(nb_compagnie): customer = pd.DataFrame() campaigns_brut = pd.DataFrame() campaigns_kpi = pd.DataFrame() products = pd.DataFrame() tickets = pd.DataFrame() # début de la boucle permettant de générer des datasets agrégés pour les 5 compagnies de spectacle for directory_path in nb_compagnie: df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned") df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at']) df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date']) df_target_information = display_databases(directory_path, file_name = "target_information") df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut) df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced) df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0) # creation de la colonne Number compagnie, qui permettra d'agréger les résultats df_tickets_kpi["number_company"]=int(directory_path) df_campaigns_brut["number_company"]=int(directory_path) df_campaigns_kpi["number_company"]=int(directory_path) df_customerplus_clean["number_company"]=int(directory_path) df_target_information["number_company"]=int(directory_path) # Traitement des index df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str') df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str') df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str') df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str') df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str') # Concaténation customer = pd.concat([customer, df_customerplus_clean], ignore_index=True) campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True) campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True) tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True) products = pd.concat([products, df_products_purchased_reduced], ignore_index=True) return customer, campaigns_kpi, campaigns_brut, tickets, products def save_file_s3(File_name, type_of_activity): image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png') image_buffer.seek(0) FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/" FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png' with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: s3_file.write(image_buffer.read()) plt.close() def outlier_detection(tickets, company_list, show_diagram=False): outlier_list = list() for company in company_list: total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index() total_amount_share['CA'] = total_amount_share['total_amount'].sum() total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA'] total_amount_share_index = total_amount_share.set_index('customer_id') df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False) #print('df circulaire : ', df_circulaire.head()) top = df_circulaire[:1] #print('top : ', top) outlier_list.append(top.index[0]) rest = df_circulaire[1:] rest_sum = rest.sum() new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])]) if show_diagram: plt.figure(figsize=(3, 3)) plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5) plt.axis('equal') plt.title(f'Répartition des montants totaux pour la compagnie {company}') plt.show() return outlier_list def valid_customer_detection(products, campaigns_brut): products_valid = products[products['purchase_date']>="2021-05-01"] consumer_valid_product = products_valid['customer_id'].to_list() campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"] consumer_valid_campaigns = campaigns_valid['customer_id'].to_list() consumer_valid = consumer_valid_product + consumer_valid_campaigns return consumer_valid def identify_purchase_during_target_periode(products): products_target_period = products[(products['purchase_date']>="2022-11-01") & (products['purchase_date']<="2023-11-01")] customer_target_period = products_target_period['customer_id'].to_list() return customer_target_period def remove_elements(lst, elements_to_remove): return ''.join([x for x in lst if x not in elements_to_remove]) def compute_nb_clients(customer, type_of_activity): company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index() plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000) plt.xlabel('Company') plt.ylabel("Number of clients (thousands)") plt.title(f"Number of clients for {type_of_activity}") plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]]) plt.show() save_file_s3("nb_clients_", type_of_activity) def maximum_price_paid(customer, type_of_activity): company_max_price = customer.groupby("number_company")["max_price"].max().reset_index() plt.bar(company_max_price["number_company"], company_max_price["max_price"]) plt.xlabel('Company') plt.ylabel("Maximal price of a ticket Prix") plt.title(f"Maximal price of a ticket for {type_of_activity}") plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]]) plt.show() save_file_s3("Maximal_price_", type_of_activity) def mailing_consent(customer, type_of_activity): mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index() plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"]) plt.xlabel('Company') plt.ylabel('Consent') plt.title(f'Consent of mailing for {type_of_activity}') plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]]) plt.show() save_file_s3("mailing_consent_", type_of_activity) def mailing_consent_by_target(customer): df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index() # Création du barplot groupé fig, ax = plt.subplots(figsize=(10, 6)) categories = df_graph["number_company"].unique() bar_width = 0.35 bar_positions = np.arange(len(categories)) # Grouper les données par label et créer les barres groupées for label in df_graph["has_purchased_target_period"].unique(): label_data = df_graph[df_graph['has_purchased_target_period'] == label] values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories] label_printed = "purchased" if label else "no purchase" ax.bar(bar_positions, values, bar_width, label=label_printed) # Mise à jour des positions des barres pour le prochain groupe bar_positions = [pos + bar_width for pos in bar_positions] # Ajout des étiquettes, de la légende, etc. ax.set_xlabel('Company') ax.set_ylabel('Consent') ax.set_title(f'Consent of mailing according to target for {type_of_activity}') ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() # Affichage du plot plt.show() save_file_s3("mailing_consent_target_", type_of_activity) def gender_bar(customer, type_of_activity): company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index() # Création du barplot plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme") plt.bar(company_genders["number_company"], company_genders["gender_female"], bottom = company_genders["gender_male"], label = "Femme") plt.bar(company_genders["number_company"], company_genders["gender_other"], bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu") plt.xlabel('Company') plt.ylabel("Gender") plt.title(f"Gender of Customer for {type_of_activity}") plt.legend() plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]]) plt.show() save_file_s3("gender_bar_", type_of_activity) def country_bar(customer, type_of_activity): company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index() plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"]) plt.xlabel('Company') plt.ylabel("Share of French Customer") plt.title(f"Share of French Customer for {type_of_activity}") plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]]) plt.show() save_file_s3("country_bar_", type_of_activity) def lazy_customer_plot(campaigns_kpi, type_of_activity): company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index() plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"]) plt.xlabel('Company') plt.ylabel("Share of Customers who did not open mail") plt.title(f"Share of Customers who did not open mail for {type_of_activity}") plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]]) plt.show() save_file_s3("lazy_customer_", type_of_activity) def campaigns_effectiveness(customer, type_of_activity): campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index() plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"]) plt.xlabel('Company') plt.ylabel("Number of Customers (thousands)") plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}") plt.legend() plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]]) plt.show() save_file_s3("campaigns_effectiveness_", type_of_activity) def sale_dynamics(products, campaigns_brut, type_of_activity): purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index() purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True) purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event']) purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m')) # Mois du premier mails first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index() first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True) first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception']) first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m')) # Fusion known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']], first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer') # Mois à partir duquel le client est considere comme connu known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601') # Nombre de commande par mois purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner') purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1) purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m')) purchases_count = purchases_count[purchases_count['customer_id'] != 1] # Nombre de commande par mois par type de client nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index() nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True) nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index() nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True) # Graphique en nombre de commande purchases_graph = nb_purchases_graph purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)] purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False] purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True] merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old")) plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client") plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"], bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client") # commande pr afficher slt plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y')) plt.xlabel('Month') plt.ylabel("Number of Sales") plt.title(f"Number of Sales for {type_of_activity}") plt.legend() plt.show() save_file_s3("sale_dynamics_", type_of_activity) def tickets_internet(tickets, type_of_activity): nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index() nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"] plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"]) plt.xlabel('Company') plt.ylabel("Share of Tickets Bought Online") plt.title(f"Share of Tickets Bought Online for {type_of_activity}") plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]]) plt.show() save_file_s3("tickets_internet_", type_of_activity) def box_plot_price_tickets(tickets, type_of_activity): price_tickets = tickets[(tickets['total_amount'] > 0)] sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True) plt.title(f"Box plot of price tickets for {type_of_activity}") plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]]) plt.show() save_file_s3("box_plot_price_tickets_", type_of_activity)