From ea3dcbb015d6f065608e443234a58600fdc5d851 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Tue, 2 Apr 2024 21:12:07 +0000 Subject: [PATCH] =?UTF-8?q?Am=C3=A9lioration=20graphique=20lazy=20+=20meil?= =?UTF-8?q?leur=20cadrage=20+=20enlever=20titre?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 4_Descriptive_Statistics.py | 6 +++ utils_stat_desc.py | 90 ++++++++++++++++++++++--------------- 2 files changed, 60 insertions(+), 36 deletions(-) diff --git a/4_Descriptive_Statistics.py b/4_Descriptive_Statistics.py index 45af7ee..4b0a894 100644 --- a/4_Descriptive_Statistics.py +++ b/4_Descriptive_Statistics.py @@ -5,6 +5,12 @@ import io import s3fs import re import warnings +from datetime import date, timedelta, datetime +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +import seaborn as sns + + # Ignore warning warnings.filterwarnings('ignore') diff --git a/utils_stat_desc.py b/utils_stat_desc.py index 471fe19..41aa4e5 100644 --- a/utils_stat_desc.py +++ b/utils_stat_desc.py @@ -1,13 +1,3 @@ -import pandas as pd -import os -import s3fs -import io -import warnings -from datetime import date, timedelta, datetime -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.dates as mdates -import seaborn as sns def load_files(nb_compagnie): @@ -84,7 +74,7 @@ def remove_outlier_total_amount(tickets): def save_file_s3(File_name, type_of_activity): image_buffer = io.BytesIO() - plt.savefig(image_buffer, format='png') + plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight") image_buffer.seek(0) FILE_PATH = f"projet-bdc2324-team1/2_Output/2_0_Descriptive_Statistics/{type_of_activity}/" FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png' @@ -118,7 +108,7 @@ def outlier_detection(tickets, company_list, show_diagram=False): plt.figure(figsize=(3, 3)) plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5) plt.axis('equal') - plt.title(f'Répartition des montants totaux pour la compagnie {company}') + # plt.title(f'Répartition des montants totaux pour la compagnie {company}') plt.show() return outlier_list @@ -147,11 +137,11 @@ def remove_elements(lst, elements_to_remove): def compute_nb_clients(customer, type_of_activity): company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index() + plt.figure(figsize=(4,3)) plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000) - - plt.xlabel('Company') + plt.xlabel('Company Number') plt.ylabel("Number of clients (thousands)") - plt.title(f"Number of clients Across {type_of_activity} Companies") + # plt.title(f"Number of clients Across {type_of_activity} Companies") plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]]) plt.show() save_file_s3("nb_clients_", type_of_activity) @@ -163,7 +153,7 @@ def maximum_price_paid(customer, type_of_activity): plt.xlabel('Company Number') plt.ylabel("Maximal price of a ticket Prix") - plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies") + # plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies") plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]]) plt.show() save_file_s3("Maximal_price_", type_of_activity) @@ -176,7 +166,7 @@ def target_proportion(customer, type_of_activity): plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"]) plt.xlabel('Company Number') plt.ylabel('Share (%)') - plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies') + # plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies') plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]]) plt.show() save_file_s3("share_target_", type_of_activity) @@ -189,7 +179,7 @@ def mailing_consent(customer, type_of_activity): plt.xlabel('Company Number') plt.ylabel('Mailing Consent (%)') - plt.title(f'Consent of mailing Across {type_of_activity} Companies') + # plt.title(f'Consent of mailing Across {type_of_activity} Companies') plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]]) plt.show() save_file_s3("mailing_consent_", type_of_activity) @@ -198,7 +188,7 @@ def mailing_consent(customer, type_of_activity): def mailing_consent_by_target(customer): df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index() # Création du barplot groupé - fig, ax = plt.subplots(figsize=(10, 6)) + fig, ax = plt.subplots(figsize=(5, 3)) categories = df_graph["number_company"].unique() bar_width = 0.35 @@ -218,7 +208,7 @@ def mailing_consent_by_target(customer): # Ajout des étiquettes, de la légende, etc. ax.set_xlabel('Company Number') ax.set_ylabel('Mailing Consent (%)') - ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies') + # ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies') ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() @@ -236,6 +226,7 @@ def gender_bar(customer, type_of_activity): company_genders["gender_other"] *= 100 # Création du barplot + plt.figure(figsize=(4,3)) plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male") plt.bar(company_genders["number_company"], company_genders["gender_female"], bottom = company_genders["gender_male"], label = "Female") @@ -244,7 +235,7 @@ def gender_bar(customer, type_of_activity): plt.xlabel('Company Number') plt.ylabel("Frequency (%)") - plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies") + # plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies") plt.legend() plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]]) plt.show() @@ -254,23 +245,46 @@ def gender_bar(customer, type_of_activity): def country_bar(customer, type_of_activity): company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index() company_country_fr["country_fr"] *= 100 + plt.figure(figsize=(4,3)) plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"]) - plt.xlabel('Company Number') plt.ylabel("Share of French Customer (%)") - plt.title(f"Share of French Customer Across {type_of_activity} Companies") + # plt.title(f"Share of French Customer Across {type_of_activity} Companies") plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]]) plt.show() save_file_s3("country_bar_", type_of_activity) def lazy_customer_plot(campaigns_kpi, type_of_activity): - company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index() - plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"]) + company_lazy_customers = campaigns_kpi.groupby("number_company")[["nb_campaigns", "taux_ouverture_mail"]].mean().reset_index() + company_lazy_customers["taux_ouverture_mail"] *= 100 + + # Initialize the figure + fig, ax1 = plt.subplots(figsize=(6, 3)) + width = 0.4 + x = range(len(company_lazy_customers)) + + # Plot the bars for "nb_campaigns" on the first y-axis + ax1.bar([i - width/2 for i in x], company_lazy_customers['nb_campaigns'], width=width, align='center', label='Amount of Campaigns', color = 'steelblue') + + # Set labels and title for the first y-axis + ax1.set_ylabel('Number of Mails Received', color='steelblue') + ax1.tick_params(axis='y', labelcolor='steelblue') + + # Create another y-axis for "taux_ouverture_mail" + ax2 = ax1.twinx() + + # Plot the bars for "taux_ouverture_mail" on the second y-axis + ax2.bar([i + width/2 for i in x], company_lazy_customers['taux_ouverture_mail'], width=width, align='center', label='Open Mail Rate', color = 'darkorange') + + # Set labels and title for the second y-axis + ax2.set_ylabel('Open Mail Rate (%)', color='darkorange') + ax2.tick_params(axis='y', labelcolor='darkorange') + + # Set x-axis ticks and labels + ax1.set_xticks(x) + ax1.set_xticklabels(company_lazy_customers['number_company']) - plt.xlabel('Company Number') - plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies") - plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]]) plt.show() save_file_s3("lazy_customer_", type_of_activity) @@ -279,7 +293,7 @@ def campaigns_effectiveness(customer, type_of_activity): campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index() - fig, ax = plt.subplots(figsize=(10, 6)) + fig, ax = plt.subplots(figsize=(5, 3)) categories = campaigns_effectiveness["number_company"].unique() bar_width = 0.35 @@ -299,7 +313,7 @@ def campaigns_effectiveness(customer, type_of_activity): # Ajout des étiquettes, de la légende, etc. ax.set_xlabel('Company Number') ax.set_ylabel('Share of Consent (%)') - ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)") + # ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)") ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() @@ -349,7 +363,8 @@ def sale_dynamics(products, campaigns_brut, type_of_activity): merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old")) - + plt.figure(figsize=(5.5,4)) + plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers") plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"], bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers") @@ -360,7 +375,7 @@ def sale_dynamics(products, campaigns_brut, type_of_activity): plt.xlabel('Month') plt.ylabel("Number of Sales") - plt.title(f"Number of Sales Across {type_of_activity} Companies") + # plt.title(f"Number of Sales Across {type_of_activity} Companies") plt.legend() plt.show() save_file_s3("sale_dynamics_", type_of_activity) @@ -373,7 +388,7 @@ def tickets_internet(tickets, type_of_activity): plt.xlabel('Company Number') plt.ylabel("Share of Purchases Bought Online (%)") - plt.title(f"Share of Online Purchases Across {type_of_activity} Companies") + # plt.title(f"Share of Online Purchases Across {type_of_activity} Companies") plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]]) plt.show() save_file_s3("tickets_internet_", type_of_activity) @@ -389,7 +404,7 @@ def already_bought_online(tickets, type_of_activity): plt.xlabel('Company Number') plt.ylabel("Share of Customer who Bought Online at least once (%)") - plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies") + # plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies") plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]]) plt.show() save_file_s3("First_buy_internet_", type_of_activity) @@ -397,8 +412,11 @@ def already_bought_online(tickets, type_of_activity): def box_plot_price_tickets(tickets, type_of_activity): price_tickets = tickets[(tickets['total_amount'] > 0)] + plt.figure(figsize=(4,3)) sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True) - plt.title(f"Box plot of price tickets Across {type_of_activity} Companies") + # plt.title(f"Box plot of price tickets Across {type_of_activity} Companies") + plt.xlabel('Company Number') + plt.ylabel("Total Amount Spent") plt.show() save_file_s3("box_plot_price_tickets_", type_of_activity) @@ -417,7 +435,7 @@ def target_description(targets, type_of_activity): plot = describe_target.plot.bar() # Adding a title - plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies") + # plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies") # Adding labels for x and y axes plot.set_xlabel("Company Number")