fix errors
This commit is contained in:
		
							parent
							
								
									db6eaaaa8d
								
							
						
					
					
						commit
						15c102682a
					
				
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| 
						 | 
				
			
			@ -1,6 +1,7 @@
 | 
			
		|||
import pandas as pd
 | 
			
		||||
import numpy as np
 | 
			
		||||
import os
 | 
			
		||||
import io
 | 
			
		||||
import s3fs
 | 
			
		||||
import re
 | 
			
		||||
import warnings
 | 
			
		||||
| 
						 | 
				
			
			@ -16,7 +17,7 @@ S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
 | 
			
		|||
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 | 
			
		||||
 | 
			
		||||
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
 | 
			
		||||
            'sport': ['5', '6'],
 | 
			
		||||
            'sport': ['5'],
 | 
			
		||||
            'musique' : ['10', '11', '12', '13', '14']}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -32,17 +33,17 @@ outlier_list = outlier_detection(tickets, list_of_comp)
 | 
			
		|||
# Identify valid customers (customers who bought tickets or received mails after the starting date)
 | 
			
		||||
customer_valid_list = valid_customer_detection(products, campaigns_brut)
 | 
			
		||||
 | 
			
		||||
# Identify customers who made a purchase during the target period (the period of y)
 | 
			
		||||
consumer_target_period = identify_purchase_during_target_periode(products)
 | 
			
		||||
 | 
			
		||||
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
 | 
			
		||||
 | 
			
		||||
for dataset in databases:
 | 
			
		||||
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
 | 
			
		||||
    dataset['customer_id'] = dataset['customer_id'].isin(customer_valid_list) # keep only valid customer
 | 
			
		||||
    dataset['has_purchased_target_period'] = np.where(dataset['customer_id'].isin(customer_valid_list), 1, 0)
 | 
			
		||||
    dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
 | 
			
		||||
    #print(f'shape of {dataset} : ', dataset.shape)
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
# Identify customers who made a purchase during the target period (the period of y)
 | 
			
		||||
customer_target_period = identify_purchase_during_target_periode(products)
 | 
			
		||||
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
 | 
			
		||||
 | 
			
		||||
# Generate the graphs and automatically save them in the bucket
 | 
			
		||||
compute_nb_clients(customer, type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -52,16 +53,16 @@ mailing_consent(customer, type_of_activity)
 | 
			
		|||
 | 
			
		||||
mailing_consent_by_target(customer)
 | 
			
		||||
 | 
			
		||||
#gender_bar(customer, type_of_activity)
 | 
			
		||||
gender_bar(customer, type_of_activity)
 | 
			
		||||
 | 
			
		||||
#country_bar(customer, type_of_activity)
 | 
			
		||||
country_bar(customer, type_of_activity)
 | 
			
		||||
 | 
			
		||||
#lazy_customer_plot(campaigns_kpi, type_of_activity)
 | 
			
		||||
lazy_customer_plot(campaigns_kpi, type_of_activity)
 | 
			
		||||
 | 
			
		||||
#campaigns_effectiveness(customer, type_of_activity)
 | 
			
		||||
 | 
			
		||||
#sale_dynamics(products, campaigns_brut, type_of_activity)
 | 
			
		||||
sale_dynamics(products, campaigns_brut, type_of_activity)
 | 
			
		||||
 | 
			
		||||
#tickets_internet(tickets, type_of_activity)
 | 
			
		||||
tickets_internet(tickets, type_of_activity)
 | 
			
		||||
 | 
			
		||||
#box_plot_price_tickets(tickets, type_of_activity)
 | 
			
		||||
box_plot_price_tickets(tickets, type_of_activity)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,6 +1,7 @@
 | 
			
		|||
import pandas as pd
 | 
			
		||||
import os
 | 
			
		||||
import s3fs
 | 
			
		||||
import io
 | 
			
		||||
import warnings
 | 
			
		||||
from datetime import date, timedelta, datetime
 | 
			
		||||
import numpy as np
 | 
			
		||||
| 
						 | 
				
			
			@ -53,10 +54,14 @@ def load_files(nb_compagnie):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def save_file_s3(File_name, type_of_activity):
 | 
			
		||||
    image_buffer = io.BytesIO()
 | 
			
		||||
    plt.savefig(image_buffer, format='png')
 | 
			
		||||
    image_buffer.seek(0)
 | 
			
		||||
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
 | 
			
		||||
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
 | 
			
		||||
    with fs.open(FILE_PATH_OUT_S3, 'wb') as file_out:
 | 
			
		||||
        plt.savefig(file_out)
 | 
			
		||||
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
 | 
			
		||||
        s3_file.write(image_buffer.read())
 | 
			
		||||
    plt.close()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def outlier_detection(tickets, company_list, show_diagram=False):
 | 
			
		||||
| 
						 | 
				
			
			@ -72,7 +77,7 @@ def outlier_detection(tickets, company_list, show_diagram=False):
 | 
			
		|||
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
 | 
			
		||||
        #print('df circulaire : ', df_circulaire.head())
 | 
			
		||||
        top = df_circulaire[:1]
 | 
			
		||||
        print('top : ', top)
 | 
			
		||||
        #print('top : ', top)
 | 
			
		||||
        outlier_list.append(top.index[0])
 | 
			
		||||
        rest = df_circulaire[1:]
 | 
			
		||||
    
 | 
			
		||||
| 
						 | 
				
			
			@ -101,9 +106,10 @@ def valid_customer_detection(products, campaigns_brut):
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
def identify_purchase_during_target_periode(products):
 | 
			
		||||
    products_target_period = products[(products['purchase_date']>="2022-11-01") & (products['purchase_date']<="2023-11-01")]
 | 
			
		||||
    consumer_target_period = products_target_period['customer_id'].to_list()
 | 
			
		||||
    return consumer_target_period
 | 
			
		||||
    products_target_period = products[(products['purchase_date']>="2022-11-01")
 | 
			
		||||
    & (products['purchase_date']<="2023-11-01")]
 | 
			
		||||
    customer_target_period = products_target_period['customer_id'].to_list()
 | 
			
		||||
    return customer_target_period
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
def remove_elements(lst, elements_to_remove):
 | 
			
		||||
| 
						 | 
				
			
			@ -117,7 +123,7 @@ def compute_nb_clients(customer, type_of_activity):
 | 
			
		|||
    plt.xlabel('Company')
 | 
			
		||||
    plt.ylabel("Number of clients (thousands)")
 | 
			
		||||
    plt.title(f"Number of clients for {type_of_activity}")
 | 
			
		||||
    
 | 
			
		||||
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("nb_clients_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -129,7 +135,7 @@ def maximum_price_paid(customer, type_of_activity):
 | 
			
		|||
    plt.xlabel('Company')
 | 
			
		||||
    plt.ylabel("Maximal price of a ticket Prix")
 | 
			
		||||
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
 | 
			
		||||
    
 | 
			
		||||
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("Maximal_price_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -140,9 +146,9 @@ def mailing_consent(customer, type_of_activity):
 | 
			
		|||
    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
 | 
			
		||||
 | 
			
		||||
    plt.xlabel('Company')
 | 
			
		||||
    plt.ylabel('Company')
 | 
			
		||||
    plt.ylabel('Consent')
 | 
			
		||||
    plt.title(f'Consent of mailing for {type_of_activity}')
 | 
			
		||||
 | 
			
		||||
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("mailing_consent_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -169,7 +175,7 @@ def mailing_consent_by_target(customer):
 | 
			
		|||
    
 | 
			
		||||
    # Ajout des étiquettes, de la légende, etc.
 | 
			
		||||
    ax.set_xlabel('Company')
 | 
			
		||||
    ax.set_ylabel('Company')
 | 
			
		||||
    ax.set_ylabel('Consent')
 | 
			
		||||
    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
 | 
			
		||||
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
 | 
			
		||||
    ax.set_xticklabels(categories)
 | 
			
		||||
| 
						 | 
				
			
			@ -183,6 +189,7 @@ def mailing_consent_by_target(customer):
 | 
			
		|||
def gender_bar(customer, type_of_activity):
 | 
			
		||||
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
 | 
			
		||||
    
 | 
			
		||||
    # Création du barplot
 | 
			
		||||
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
 | 
			
		||||
    plt.bar(company_genders["number_company"], company_genders["gender_female"], 
 | 
			
		||||
            bottom = company_genders["gender_male"], label = "Femme")
 | 
			
		||||
| 
						 | 
				
			
			@ -193,12 +200,10 @@ def gender_bar(customer, type_of_activity):
 | 
			
		|||
    plt.ylabel("Gender")
 | 
			
		||||
    plt.title(f"Gender of Customer for {type_of_activity}")
 | 
			
		||||
    plt.legend()
 | 
			
		||||
 | 
			
		||||
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
 | 
			
		||||
 
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("gender_bar_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
def country_bar(customer, type_of_activity):
 | 
			
		||||
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
 | 
			
		||||
| 
						 | 
				
			
			@ -207,7 +212,7 @@ def country_bar(customer, type_of_activity):
 | 
			
		|||
    plt.xlabel('Company')
 | 
			
		||||
    plt.ylabel("Share of French Customer")
 | 
			
		||||
    plt.title(f"Share of French Customer for {type_of_activity}")
 | 
			
		||||
    
 | 
			
		||||
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("country_bar_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -219,7 +224,7 @@ def lazy_customer_plot(campaigns_kpi, type_of_activity):
 | 
			
		|||
    plt.xlabel('Company')
 | 
			
		||||
    plt.ylabel("Share of Customers who did not open mail")
 | 
			
		||||
    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
 | 
			
		||||
    
 | 
			
		||||
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("lazy_customer_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -234,6 +239,7 @@ def campaigns_effectiveness(customer, type_of_activity):
 | 
			
		|||
    plt.ylabel("Number of Customers (thousands)")
 | 
			
		||||
    plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
 | 
			
		||||
    plt.legend()
 | 
			
		||||
    plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("campaigns_effectiveness_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -243,45 +249,56 @@ def sale_dynamics(products, campaigns_brut, type_of_activity):
 | 
			
		|||
    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
 | 
			
		||||
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
 | 
			
		||||
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    # Mois du premier mails
 | 
			
		||||
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
 | 
			
		||||
    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
 | 
			
		||||
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
 | 
			
		||||
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    # Fusion 
 | 
			
		||||
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']], 
 | 
			
		||||
                          first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
 | 
			
		||||
 | 
			
		||||
                      first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
 | 
			
		||||
    
 | 
			
		||||
    # Mois à partir duquel le client est considere comme connu
 | 
			
		||||
    
 | 
			
		||||
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    # Nombre de commande par mois
 | 
			
		||||
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
 | 
			
		||||
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
 | 
			
		||||
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
 | 
			
		||||
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]
 | 
			
		||||
    
 | 
			
		||||
    # Nombre de commande par mois par type de client
 | 
			
		||||
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
 | 
			
		||||
    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
 | 
			
		||||
    
 | 
			
		||||
    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
 | 
			
		||||
    nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    # Graphique en nombre de commande
 | 
			
		||||
    purchases_graph = nb_purchases_graph
 | 
			
		||||
    
 | 
			
		||||
    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
 | 
			
		||||
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
 | 
			
		||||
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
 | 
			
		||||
    
 | 
			
		||||
    plt.bar(purchases_graph_used_0["purchase_date_month"], purchases_graph_used_0["nb_purchases"], width=12, label = "Nouveau client")
 | 
			
		||||
    plt.bar(purchases_graph_used_0["purchase_date_month"], purchases_graph_used_1["nb_purchases"], 
 | 
			
		||||
            bottom = purchases_graph_used_0["nb_purchases"], width=12, label = "Ancien client")
 | 
			
		||||
    
 | 
			
		||||
    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
 | 
			
		||||
    
 | 
			
		||||
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
 | 
			
		||||
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"], 
 | 
			
		||||
            bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")
 | 
			
		||||
    
 | 
			
		||||
    
 | 
			
		||||
    # commande pr afficher slt
 | 
			
		||||
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
 | 
			
		||||
    
 | 
			
		||||
    plt.xlabel('Month')
 | 
			
		||||
    plt.ylabel("Number of Sales")
 | 
			
		||||
    plt.title(f"Number of Sales for {type_of_activity}")
 | 
			
		||||
    plt.legend()
 | 
			
		||||
    
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("sale_dynamics_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -295,7 +312,7 @@ def tickets_internet(tickets, type_of_activity):
 | 
			
		|||
    plt.xlabel('Company')
 | 
			
		||||
    plt.ylabel("Share of Tickets Bought Online")
 | 
			
		||||
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
 | 
			
		||||
    
 | 
			
		||||
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("tickets_internet_", type_of_activity)
 | 
			
		||||
    
 | 
			
		||||
| 
						 | 
				
			
			@ -304,7 +321,7 @@ def box_plot_price_tickets(tickets, type_of_activity):
 | 
			
		|||
    price_tickets = tickets[(tickets['total_amount'] > 0)]
 | 
			
		||||
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
 | 
			
		||||
    plt.title(f"Box plot of price tickets for {type_of_activity}")
 | 
			
		||||
    
 | 
			
		||||
    plt.xticks(price_tickets["number_company"], ["{}".format(i) for i in price_tickets["number_company"]])
 | 
			
		||||
    plt.show()
 | 
			
		||||
    save_file_s3("box_plot_price_tickets_", type_of_activity)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user