fix errors

This commit is contained in:
Alexis REVELLE 2024-03-21 10:47:40 +00:00
parent 089a8fd3d6
commit 52fd738fe5
3 changed files with 38 additions and 6 deletions

View File

@ -47,7 +47,9 @@ customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(
# Generate graph and automatically saved them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
#maximum_price_paid(customer, type_of_activity)
target_proportion(customer, type_of_activity)
mailing_consent(customer, type_of_activity)

View File

@ -55,16 +55,16 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes =
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
'nb_campaigns', 'nb_campaigns_opened']
numeric_transformer = Pipeline(steps=[
#("imputer", SimpleImputer(strategy="mean")),
("scaler", StandardScaler())
])
categorical_features = ['opt_in']
categorical_features = ['opt_in', 'gender_male', 'gender_female']
# Transformer for the categorical features
categorical_transformer = Pipeline(steps=[

View File

@ -43,6 +43,13 @@ def load_files(nb_compagnie):
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
# Remove companies' outliers
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
# harmonize set of customers across databases
customer_id = df_tickets_kpi['customer_id'].to_list()
for dataset in [df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean, df_target_information]:
dataset = dataset[dataset['customer_id'].isin(customer_id)]
# Concaténation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
@ -53,6 +60,16 @@ def load_files(nb_compagnie):
return customer, campaigns_kpi, campaigns_brut, tickets, products
def remove_outlier_total_amount(tickets, iqr_factor=1.5):
    """Drop every customer owning at least one outlying ``total_amount``.

    A row counts as an outlier when its ``total_amount`` is strictly greater
    than the upper fence ``Q3 + iqr_factor * (Q3 - Q1)``, where the quartiles
    are computed over all rows of ``tickets``. Every row belonging to a
    customer with such a row is removed, not only the outlying row itself.
    Only the upper fence is applied; low values are never discarded.

    Args:
        tickets: DataFrame exposing ``total_amount`` and ``customer_id``
            columns.
        iqr_factor: Multiplier applied to the interquartile range when
            building the upper fence. Defaults to 1.5.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    amounts = tickets['total_amount']
    q1 = amounts.quantile(0.25)
    q3 = amounts.quantile(0.75)
    upper = q3 + iqr_factor * (q3 - q1)

    # Collect the distinct customers that own at least one row above the fence.
    outlier_ids = tickets.loc[amounts > upper, 'customer_id'].unique()
    return tickets[~tickets['customer_id'].isin(outlier_ids)]
def save_file_s3(File_name, type_of_activity):
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
@ -140,6 +157,19 @@ def maximum_price_paid(customer, type_of_activity):
save_file_s3("Maximal_price_", type_of_activity)
def target_proportion(customer, type_of_activity):
    """Plot, per company, the share of customers who bought in the target period.

    For each ``number_company`` the sum of ``has_purchased_target_period`` is
    divided by the number of distinct ``customer_id`` values and expressed as
    a percentage. The bar chart is shown and then handed to ``save_file_s3``.

    Args:
        customer: DataFrame with ``number_company``, ``customer_id`` and
            ``has_purchased_target_period`` columns.
        type_of_activity: Label used in the chart title and passed on to
            ``save_file_s3``.
    """
    per_company = (
        customer.groupby(["number_company"])
        .agg({"has_purchased_target_period": 'sum', 'customer_id': 'nunique'})
        .reset_index()
    )
    buyers = per_company["has_purchased_target_period"]
    distinct_customers = per_company['customer_id']
    per_company['prop_has_purchased_target_period'] = (buyers / distinct_customers) * 100

    companies = per_company["number_company"]
    tick_labels = [f"{company}" for company in companies]

    plt.bar(companies, per_company["prop_has_purchased_target_period"])
    plt.xlabel('Company Number')
    plt.ylabel('Share (%)')
    plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
    plt.xticks(companies, tick_labels)
    # NOTE(review): with some interactive backends plt.show() releases the
    # current figure, so the savefig inside save_file_s3 may write a blank
    # image -- confirm the stored PNG is not empty.
    plt.show()
    save_file_s3("share_target_", type_of_activity)
def mailing_consent(customer, type_of_activity):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
mailing_consent["opt_in"] *= 100