def load_files(nb_compagnie): customer = pd.DataFrame() campaigns_brut = pd.DataFrame() campaigns_kpi = pd.DataFrame() products = pd.DataFrame() tickets = pd.DataFrame() targets = pd.DataFrame() # début de la boucle permettant de générer des datasets agrégés pour les 5 compagnies de spectacle for directory_path in nb_compagnie: df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned") df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at']) df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date']) df_target_information = display_input_databases(directory_path, file_name = "target_information") df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC')) df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced) df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0) df_target_KPI = targets_KPI(df_target = df_target_information) # Merge and df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id') targets_columns = list(df_target_KPI.columns) targets_columns.remove('customer_id') df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0) # creation de la colonne Number compagnie, qui permettra d'agréger les résultats df_tickets_kpi["number_company"]=int(directory_path) df_campaigns_brut["number_company"]=int(directory_path) df_campaigns_kpi["number_company"]=int(directory_path) df_customerplus_clean["number_company"]=int(directory_path) df_target_information["number_company"]=int(directory_path) df_target_KPI["number_company"]=int(directory_path) # Traitement des index df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str') df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str') df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str') df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str') df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str') # Remove companies' outliers df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi) # harmonize set of customers across databases customer_id = df_tickets_kpi['customer_id'].to_list() for dataset in [df_campaigns_brut, df_campaigns_kpi, df_customerplus_clean, df_target_information]: dataset = dataset[dataset['customer_id'].isin(customer_id)] df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str') # Concaténation customer = pd.concat([customer, df_customerplus_clean], ignore_index=True) campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True) campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True) tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True) products = pd.concat([products, df_products_purchased_reduced], ignore_index=True) targets = pd.concat([targets, df_target_KPI], ignore_index=True) return customer, campaigns_kpi, campaigns_brut, tickets, products, targets def remove_outlier_total_amount(tickets): Q1 = tickets['total_amount'].quantile(0.25) Q3 = tickets['total_amount'].quantile(0.75) IQR = Q3 - Q1 upper = Q3 +1.5*IQR outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list() tickets = tickets[~tickets['customer_id'].isin(outliers)] return tickets def save_file_s3(File_name, type_of_activity): image_buffer = io.BytesIO() plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight") image_buffer.seek(0) FILE_PATH = f"projet-bdc2324-team1/2_Output/2_0_Descriptive_Statistics/{type_of_activity}/" FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png' with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: s3_file.write(image_buffer.read()) plt.close() def outlier_detection(tickets, company_list, show_diagram=False): outlier_list = list() for company in company_list: total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index() total_amount_share['CA'] = total_amount_share['total_amount'].sum() total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA'] total_amount_share_index = total_amount_share.set_index('customer_id') df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False) #print('df circulaire : ', df_circulaire.head()) top = df_circulaire[:1] #print('top : ', top) outlier_list.append(top.index[0]) rest = df_circulaire[1:] rest_sum = rest.sum() new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])]) if show_diagram: plt.figure(figsize=(3, 3)) plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5) plt.axis('equal') # plt.title(f'Répartition des montants totaux pour la compagnie {company}') plt.show() return outlier_list def valid_customer_detection(products, campaigns_brut): products_valid = products[products['purchase_date']>="2021-05-01"] consumer_valid_product = products_valid['customer_id'].to_list() campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"] consumer_valid_campaigns = campaigns_valid['customer_id'].to_list() consumer_valid = consumer_valid_product + consumer_valid_campaigns return consumer_valid def identify_purchase_during_target_periode(products): products_target_period = products[(products['purchase_date']>="2022-11-01") & (products['purchase_date']<="2023-11-01")] customer_target_period = products_target_period['customer_id'].to_list() return customer_target_period def remove_elements(lst, elements_to_remove): return ''.join([x for x in lst if x not in elements_to_remove]) def compute_nb_clients(customer, type_of_activity): company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index() plt.figure(figsize=(4,3)) plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000) plt.xlabel('Company Number') plt.ylabel("Number of clients (thousands)") # plt.title(f"Number of clients Across {type_of_activity} Companies") plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]]) plt.show() save_file_s3("nb_clients_", type_of_activity) def maximum_price_paid(customer, type_of_activity): company_max_price = customer.groupby("number_company")["max_price"].max().reset_index() plt.bar(company_max_price["number_company"], company_max_price["max_price"]) plt.xlabel('Company Number') plt.ylabel("Maximal price of a ticket Prix") # plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies") plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]]) plt.show() save_file_s3("Maximal_price_", type_of_activity) def target_proportion(customer, type_of_activity): df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum', 'customer_id' : 'nunique'}).reset_index() df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100 plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"]) plt.xlabel('Company Number') plt.ylabel('Share (%)') # plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies') plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]]) plt.show() save_file_s3("share_target_", type_of_activity) def mailing_consent(customer, type_of_activity): mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index() mailing_consent["opt_in"] *= 100 plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"]) plt.xlabel('Company Number') plt.ylabel('Mailing Consent (%)') # plt.title(f'Consent of mailing Across {type_of_activity} Companies') plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]]) plt.show() save_file_s3("mailing_consent_", type_of_activity) def mailing_consent_by_target(customer): df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index() # Création du barplot groupé fig, ax = plt.subplots(figsize=(5, 3)) categories = df_graph["number_company"].unique() bar_width = 0.35 bar_positions = np.arange(len(categories)) # Grouper les données par label et créer les barres groupées for label in df_graph["has_purchased_target_period"].unique(): label_data = df_graph[df_graph['has_purchased_target_period'] == label] values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories] label_printed = "Purchase" if label else "No purchase" ax.bar(bar_positions, values, bar_width, label=label_printed) # Mise à jour des positions des barres pour le prochain groupe bar_positions = [pos + bar_width for pos in bar_positions] # Ajout des étiquettes, de la légende, etc. ax.set_xlabel('Company Number') ax.set_ylabel('Mailing Consent (%)') # ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies') ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() # Affichage du plot plt.show() save_file_s3("mailing_consent_target_", type_of_activity) def gender_bar(customer, type_of_activity): company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index() company_genders["gender_male"] *= 100 company_genders["gender_female"] *= 100 company_genders["gender_other"] *= 100 # Création du barplot plt.figure(figsize=(4,3)) plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male") plt.bar(company_genders["number_company"], company_genders["gender_female"], bottom = company_genders["gender_male"], label = "Female") plt.bar(company_genders["number_company"], company_genders["gender_other"], bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown") plt.xlabel('Company Number') plt.ylabel("Frequency (%)") # plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies") plt.legend() plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]]) plt.show() save_file_s3("gender_bar_", type_of_activity) def country_bar(customer, type_of_activity): company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index() company_country_fr["country_fr"] *= 100 plt.figure(figsize=(4,3)) plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"]) plt.xlabel('Company Number') plt.ylabel("Share of French Customer (%)") # plt.title(f"Share of French Customer Across {type_of_activity} Companies") plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]]) plt.show() save_file_s3("country_bar_", type_of_activity) def lazy_customer_plot(campaigns_kpi, type_of_activity): company_lazy_customers = campaigns_kpi.groupby("number_company")[["nb_campaigns", "taux_ouverture_mail"]].mean().reset_index() company_lazy_customers["taux_ouverture_mail"] *= 100 # Initialize the figure fig, ax1 = plt.subplots(figsize=(6, 3)) width = 0.4 x = range(len(company_lazy_customers)) # Plot the bars for "nb_campaigns" on the first y-axis ax1.bar([i - width/2 for i in x], company_lazy_customers['nb_campaigns'], width=width, align='center', label='Amount of Campaigns', color = 'steelblue') # Set labels and title for the first y-axis ax1.set_ylabel('Number of Mails Received', color='steelblue') ax1.tick_params(axis='y', labelcolor='steelblue') # Create another y-axis for "taux_ouverture_mail" ax2 = ax1.twinx() # Plot the bars for "taux_ouverture_mail" on the second y-axis ax2.bar([i + width/2 for i in x], company_lazy_customers['taux_ouverture_mail'], width=width, align='center', label='Open Mail Rate', color = 'darkorange') # Set labels and title for the second y-axis ax2.set_ylabel('Open Mail Rate (%)', color='darkorange') ax2.tick_params(axis='y', labelcolor='darkorange') # Set x-axis ticks and labels ax1.set_xticks(x) ax1.set_xticklabels(company_lazy_customers['number_company']) plt.show() save_file_s3("lazy_customer_", type_of_activity) def campaigns_effectiveness(customer, type_of_activity): campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index() fig, ax = plt.subplots(figsize=(5, 3)) categories = campaigns_effectiveness["number_company"].unique() bar_width = 0.35 bar_positions = np.arange(len(categories)) # Grouper les données par label et créer les barres groupées for label in campaigns_effectiveness["has_purchased_target_period"].unique(): label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label] values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories] label_printed = "Purchase" if label else "No purchase" ax.bar(bar_positions, values, bar_width, label=label_printed) # Mise à jour des positions des barres pour le prochain groupe bar_positions = [pos + bar_width for pos in bar_positions] # Ajout des étiquettes, de la légende, etc. ax.set_xlabel('Company Number') ax.set_ylabel('Share of Consent (%)') # ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)") ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))]) ax.set_xticklabels(categories) ax.legend() plt.show() save_file_s3("campaigns_effectiveness_", type_of_activity) def sale_dynamics(products, campaigns_brut, type_of_activity): purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index() purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True) purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event']) purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m')) # Mois du premier mails first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index() first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True) first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception']) first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m')) # Fusion known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']], first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer') # Mois à partir duquel le client est considere comme connu known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601') # Nombre de commande par mois purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner') purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1) purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m')) purchases_count = purchases_count[purchases_count['customer_id'] != 1] # Nombre de commande par mois par type de client nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index() nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True) nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index() nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True) # Graphique en nombre de commande purchases_graph = nb_purchases_graph purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)] purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False] purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True] merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old")) plt.figure(figsize=(5.5,4)) plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers") plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"], bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers") # commande pr afficher slt plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y')) plt.xlabel('Month') plt.ylabel("Number of Sales") # plt.title(f"Number of Sales Across {type_of_activity} Companies") plt.legend() plt.show() save_file_s3("sale_dynamics_", type_of_activity) def tickets_internet(tickets, type_of_activity): nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index() nb_tickets_internet['prop_purchases_internet'] *=100 plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"]) plt.xlabel('Company Number') plt.ylabel("Share of Purchases Bought Online (%)") # plt.title(f"Share of Online Purchases Across {type_of_activity} Companies") plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]]) plt.show() save_file_s3("tickets_internet_", type_of_activity) def already_bought_online(tickets, type_of_activity): nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum', 'customer_id' : 'nunique'} ).reset_index()) nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100 plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"]) plt.xlabel('Company Number') plt.ylabel("Share of Customer who Bought Online at least once (%)") # plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies") plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]]) plt.show() save_file_s3("First_buy_internet_", type_of_activity) def box_plot_price_tickets(tickets, type_of_activity): price_tickets = tickets[(tickets['total_amount'] > 0)] plt.figure(figsize=(4,3)) sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True) # plt.title(f"Box plot of price tickets Across {type_of_activity} Companies") plt.xlabel('Company Number') plt.ylabel("Total Amount Spent") plt.show() save_file_s3("box_plot_price_tickets_", type_of_activity) def target_description(targets, type_of_activity): describe_target = targets.groupby('number_company').agg( prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100), prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100), prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100), prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100), prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100), prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100), prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100), prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100)) plot = describe_target.plot.bar() # Adding a title # plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies") # Adding labels for x and y axes plot.set_xlabel("Company Number") plot.set_ylabel("Target Proportion") plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center') # Adding a legend plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category") save_file_s3("target_category_proportion_", type_of_activity)