fix errors

This commit is contained in:
Alexis REVELLE 2024-03-14 23:02:50 +00:00
parent db6eaaaa8d
commit 15c102682a
3 changed files with 121 additions and 58 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,7 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import os import os
import io
import s3fs import s3fs
import re import re
import warnings import warnings
@ -16,7 +17,7 @@ S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101' companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6'], 'sport': ['5'],
'musique' : ['10', '11', '12', '13', '14']} 'musique' : ['10', '11', '12', '13', '14']}
@ -32,17 +33,17 @@ outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customer (customer who bought tickets after starting date or received mails after starting date) # Identify valid customer (customer who bought tickets after starting date or received mails after starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut) customer_valid_list = valid_customer_detection(products, campaigns_brut)
# Identify customer who bought during the period of y
consumer_target_period = identify_purchase_during_target_periode(products)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products] databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for dataset in databases: for dataset in databases:
dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
dataset['customer_id'] = dataset['customer_id'].isin(customer_valid_list) # keep only valid customer dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
dataset['has_purchased_target_period'] = np.where(dataset['customer_id'].isin(customer_valid_list), 1, 0)
#print(f'shape of {dataset} : ', dataset.shape) #print(f'shape of {dataset} : ', dataset.shape)
# Identify customer who bought during the period of y
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate graph and automatically saved them in the bucket # Generate graph and automatically saved them in the bucket
compute_nb_clients(customer, type_of_activity) compute_nb_clients(customer, type_of_activity)
@ -52,16 +53,16 @@ mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer) mailing_consent_by_target(customer)
#gender_bar(customer, type_of_activity) gender_bar(customer, type_of_activity)
#country_bar(customer, type_of_activity) country_bar(customer, type_of_activity)
#lazy_customer_plot(campaigns_kpi, type_of_activity) lazy_customer_plot(campaigns_kpi, type_of_activity)
#campaigns_effectiveness(customer, type_of_activity) #campaigns_effectiveness(customer, type_of_activity)
#sale_dynamics(products, campaigns_brut, type_of_activity) sale_dynamics(products, campaigns_brut, type_of_activity)
#tickets_internet(tickets, type_of_activity) tickets_internet(tickets, type_of_activity)
#box_plot_price_tickets(tickets, type_of_activity) box_plot_price_tickets(tickets, type_of_activity)

View File

@ -1,6 +1,7 @@
import pandas as pd import pandas as pd
import os import os
import s3fs import s3fs
import io
import warnings import warnings
from datetime import date, timedelta, datetime from datetime import date, timedelta, datetime
import numpy as np import numpy as np
@ -53,10 +54,14 @@ def load_files(nb_compagnie):
def save_file_s3(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG through ``fs`` and close it.

    The figure is first rendered into an in-memory buffer; the resulting
    bytes are then written to the destination path opened with the
    module-level ``fs`` filesystem object.

    Args:
        File_name: Prefix of the output file name (e.g. ``"nb_clients_"``).
        type_of_activity: Activity label; used both as the sub-folder name
            and as the suffix of the file name.

    NOTE(review): the plotting functions call ``plt.show()`` right before
    this function; with an interactive backend that may leave the current
    figure empty -- confirm the backend used when this runs.
    """
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'

    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png')
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            # getvalue() returns the whole buffer regardless of the current
            # stream position, so no seek(0) is needed before writing.
            s3_file.write(image_buffer.getvalue())
    finally:
        # Always release the figure, even if rendering or the upload failed,
        # so the next plot does not draw on top of this one.
        plt.close()
def outlier_detection(tickets, company_list, show_diagram=False): def outlier_detection(tickets, company_list, show_diagram=False):
@ -72,7 +77,7 @@ def outlier_detection(tickets, company_list, show_diagram=False):
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False) df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head()) #print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1] top = df_circulaire[:1]
print('top : ', top) #print('top : ', top)
outlier_list.append(top.index[0]) outlier_list.append(top.index[0])
rest = df_circulaire[1:] rest = df_circulaire[1:]
@ -101,9 +106,10 @@ def valid_customer_detection(products, campaigns_brut):
def identify_purchase_during_target_periode(products, start_date="2022-11-01", end_date="2023-11-01"):
    """Return the customer ids attached to purchases made in the target period.

    Args:
        products: DataFrame with at least the columns ``purchase_date`` and
            ``customer_id``.
        start_date: Inclusive lower bound compared against ``purchase_date``.
            Defaults to the original hard-coded value.
        end_date: Inclusive upper bound compared against ``purchase_date``.
            Defaults to the original hard-coded value.

    Returns:
        A list with one ``customer_id`` per matching row of ``products``;
        a customer with several matching rows appears several times.
    """
    purchase_date = products['purchase_date']
    in_target_period = (purchase_date >= start_date) & (purchase_date <= end_date)
    return products.loc[in_target_period, 'customer_id'].to_list()
def remove_elements(lst, elements_to_remove): def remove_elements(lst, elements_to_remove):
@ -117,7 +123,7 @@ def compute_nb_clients(customer, type_of_activity):
plt.xlabel('Company') plt.xlabel('Company')
plt.ylabel("Number of clients (thousands)") plt.ylabel("Number of clients (thousands)")
plt.title(f"Number of clients for {type_of_activity}") plt.title(f"Number of clients for {type_of_activity}")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show() plt.show()
save_file_s3("nb_clients_", type_of_activity) save_file_s3("nb_clients_", type_of_activity)
@ -129,7 +135,7 @@ def maximum_price_paid(customer, type_of_activity):
plt.xlabel('Company') plt.xlabel('Company')
plt.ylabel("Maximal price of a ticket Prix") plt.ylabel("Maximal price of a ticket Prix")
plt.title(f"Maximal price of a ticket for {type_of_activity}") plt.title(f"Maximal price of a ticket for {type_of_activity}")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show() plt.show()
save_file_s3("Maximal_price_", type_of_activity) save_file_s3("Maximal_price_", type_of_activity)
@ -140,9 +146,9 @@ def mailing_consent(customer, type_of_activity):
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"]) plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company') plt.xlabel('Company')
plt.ylabel('Company') plt.ylabel('Consent')
plt.title(f'Consent of mailing for {type_of_activity}') plt.title(f'Consent of mailing for {type_of_activity}')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show() plt.show()
save_file_s3("mailing_consent_", type_of_activity) save_file_s3("mailing_consent_", type_of_activity)
@ -169,7 +175,7 @@ def mailing_consent_by_target(customer):
# Ajout des étiquettes, de la légende, etc. # Ajout des étiquettes, de la légende, etc.
ax.set_xlabel('Company') ax.set_xlabel('Company')
ax.set_ylabel('Company') ax.set_ylabel('Consent')
ax.set_title(f'Consent of mailing according to target for {type_of_activity}') ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories) ax.set_xticklabels(categories)
@ -183,6 +189,7 @@ def mailing_consent_by_target(customer):
def gender_bar(customer, type_of_activity): def gender_bar(customer, type_of_activity):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index() company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
# Création du barplot
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme") plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
plt.bar(company_genders["number_company"], company_genders["gender_female"], plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Femme") bottom = company_genders["gender_male"], label = "Femme")
@ -193,12 +200,10 @@ def gender_bar(customer, type_of_activity):
plt.ylabel("Gender") plt.ylabel("Gender")
plt.title(f"Gender of Customer for {type_of_activity}") plt.title(f"Gender of Customer for {type_of_activity}")
plt.legend() plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]]) plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show() plt.show()
save_file_s3("gender_bar_", type_of_activity) save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer, type_of_activity): def country_bar(customer, type_of_activity):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index() company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
@ -207,7 +212,7 @@ def country_bar(customer, type_of_activity):
plt.xlabel('Company') plt.xlabel('Company')
plt.ylabel("Share of French Customer") plt.ylabel("Share of French Customer")
plt.title(f"Share of French Customer for {type_of_activity}") plt.title(f"Share of French Customer for {type_of_activity}")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show() plt.show()
save_file_s3("country_bar_", type_of_activity) save_file_s3("country_bar_", type_of_activity)
@ -219,7 +224,7 @@ def lazy_customer_plot(campaigns_kpi, type_of_activity):
plt.xlabel('Company') plt.xlabel('Company')
plt.ylabel("Share of Customers who did not open mail") plt.ylabel("Share of Customers who did not open mail")
plt.title(f"Share of Customers who did not open mail for {type_of_activity}") plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
plt.show() plt.show()
save_file_s3("lazy_customer_", type_of_activity) save_file_s3("lazy_customer_", type_of_activity)
@ -234,6 +239,7 @@ def campaigns_effectiveness(customer, type_of_activity):
plt.ylabel("Number of Customers (thousands)") plt.ylabel("Number of Customers (thousands)")
plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}") plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
plt.legend() plt.legend()
plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
plt.show() plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity) save_file_s3("campaigns_effectiveness_", type_of_activity)
@ -243,45 +249,56 @@ def sale_dynamics(products, campaigns_brut, type_of_activity):
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True) purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event']) purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m')) purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Mois du premier mails
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index() first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True) first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception']) first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m')) first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Fusion
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']], known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer') first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Mois à partir duquel le client est considere comme connu
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601') known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Nombre de commande par mois
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner') purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1) purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m')) purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1] purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Nombre de commande par mois par type de client
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index() nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True) nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index() nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True) nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Graphique en nombre de commande
purchases_graph = nb_purchases_graph purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)] purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False] purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True] purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
plt.bar(purchases_graph_used_0["purchase_date_month"], purchases_graph_used_0["nb_purchases"], width=12, label = "Nouveau client")
plt.bar(purchases_graph_used_0["purchase_date_month"], purchases_graph_used_1["nb_purchases"],
bottom = purchases_graph_used_0["nb_purchases"], width=12, label = "Ancien client")
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")
# commande pr afficher slt
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y')) plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month') plt.xlabel('Month')
plt.ylabel("Number of Sales") plt.ylabel("Number of Sales")
plt.title(f"Number of Sales for {type_of_activity}") plt.title(f"Number of Sales for {type_of_activity}")
plt.legend() plt.legend()
plt.show() plt.show()
save_file_s3("sale_dynamics_", type_of_activity) save_file_s3("sale_dynamics_", type_of_activity)
@ -295,7 +312,7 @@ def tickets_internet(tickets, type_of_activity):
plt.xlabel('Company') plt.xlabel('Company')
plt.ylabel("Share of Tickets Bought Online") plt.ylabel("Share of Tickets Bought Online")
plt.title(f"Share of Tickets Bought Online for {type_of_activity}") plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show() plt.show()
save_file_s3("tickets_internet_", type_of_activity) save_file_s3("tickets_internet_", type_of_activity)
@ -304,7 +321,7 @@ def box_plot_price_tickets(tickets, type_of_activity):
price_tickets = tickets[(tickets['total_amount'] > 0)] price_tickets = tickets[(tickets['total_amount'] > 0)]
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True) sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
plt.title(f"Box plot of price tickets for {type_of_activity}") plt.title(f"Box plot of price tickets for {type_of_activity}")
plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
plt.show() plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity) save_file_s3("box_plot_price_tickets_", type_of_activity)