Suppression des notebooks exploratoires et brouillons
This commit is contained in:
parent
9ca22fb9e7
commit
4ed6bd809d
File diff suppressed because one or more lines are too long
|
@ -1,68 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import os
|
|
||||||
import io
|
|
||||||
import s3fs
|
|
||||||
import re
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
# Silence library warnings so the console output stays readable.
warnings.filterwarnings('ignore')

# Pull the helper definitions (load_files, outlier_detection, the plotting
# functions, ...) into this namespace.
# NOTE(review): exec() runs these files verbatim; only run this script from a
# trusted checkout. Proper imports would be preferable, but a module name
# starting with a digit ('0_KPI_functions') cannot be imported directly.
for _helper_path in ('../0_KPI_functions.py', 'plot.py'):
    with open(_helper_path) as _helper_file:
        exec(_helper_file.read())

# Create filesystem object (used as a global by the helpers that write to S3).
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Company identifiers grouped by type of activity.
companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
if type_of_activity not in companies:
    # Fail with a readable message instead of a bare KeyError.
    raise SystemExit(
        f"Unknown type of activity {type_of_activity!r}; "
        f"expected one of: {', '.join(companies)}"
    )
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify the anonymous (outlier) customer of each company
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

_outlier_ids = set(outlier_list)
_valid_ids = set(customer_valid_list)


def _keep_tracked_customers(dataset):
    """Return the rows of *dataset* whose customer is valid and not an outlier."""
    ids = dataset['customer_id']
    # .copy() so that columns can later be added without chained-assignment warnings.
    return dataset[ids.isin(_valid_ids) & ~ids.isin(_outlier_ids)].copy()


# The previous loop rebound its loop variable, which left every dataset
# unfiltered; the filtered frames must be assigned back to their names.
customer, campaigns_kpi, campaigns_brut, tickets, products = (
    _keep_tracked_customers(dataset)
    for dataset in (customer, campaigns_kpi, campaigns_brut, tickets, products)
)

# Flag customers who bought during the target period (the y variable)
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(
    customer['customer_id'].isin(customer_target_period), 1, 0
)

# Generate the graphs; each helper also saves its figure in the bucket
compute_nb_clients(customer, type_of_activity)

maximum_price_paid(customer, type_of_activity)

mailing_consent(customer, type_of_activity)

mailing_consent_by_target(customer)

gender_bar(customer, type_of_activity)

country_bar(customer, type_of_activity)

lazy_customer_plot(campaigns_kpi, type_of_activity)

# Disabled on purpose in the original script:
# campaigns_effectiveness(customer, type_of_activity)

sale_dynamics(products, campaigns_brut, type_of_activity)

tickets_internet(tickets, type_of_activity)

box_plot_price_tickets(tickets, type_of_activity)
|
|
|
@ -1,328 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
import os
|
|
||||||
import s3fs
|
|
||||||
import io
|
|
||||||
import warnings
|
|
||||||
from datetime import date, timedelta, datetime
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import matplotlib.dates as mdates
|
|
||||||
import seaborn as sns
|
|
||||||
|
|
||||||
|
|
||||||
def load_files(nb_compagnie):
    """Load and aggregate the per-company datasets.

    For every company identifier in *nb_compagnie* (strings such as ``'1'``),
    the raw tables are read with ``display_databases``, the KPI tables are
    derived with the ``*_kpi_function`` helpers, a ``number_company`` column is
    added and ``customer_id`` is prefixed with ``'<company>_'`` so identifiers
    stay unique once the companies are stacked.

    Returns:
        tuple: ``(customer, campaigns_kpi, campaigns_brut, tickets, products)``,
        each a DataFrame stacking all requested companies (empty DataFrames
        when *nb_compagnie* is empty).
    """
    customer_parts = []
    campaigns_kpi_parts = []
    campaigns_brut_parts = []
    tickets_parts = []
    products_parts = []

    # One iteration per company: load, derive KPIs, tag, then collect.
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name="customerplus_cleaned")
        df_campaigns_brut = display_databases(directory_path, file_name="campaigns_information", datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_databases(directory_path, file_name="products_purchased_reduced", datetime_col=['purchase_date'])
        # NOTE(review): loaded and tagged below but never returned - confirm
        # whether target information is still needed here.
        df_target_information = display_databases(directory_path, file_name="target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)

        # Company number column, used to aggregate the results per company.
        company_number = int(directory_path)
        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi,
                      df_customerplus_clean, df_target_information):
            frame["number_company"] = company_number

        # Make customer identifiers unique across companies.
        for frame in (df_tickets_kpi, df_campaigns_brut, df_campaigns_kpi,
                      df_customerplus_clean, df_products_purchased_reduced):
            frame["customer_id"] = directory_path + '_' + frame['customer_id'].astype('str')

        customer_parts.append(df_customerplus_clean)
        campaigns_kpi_parts.append(df_campaigns_kpi)
        campaigns_brut_parts.append(df_campaigns_brut)
        tickets_parts.append(df_tickets_kpi)
        products_parts.append(df_products_purchased_reduced)

    def _stack(parts):
        # A single concat avoids re-copying the accumulated rows on every
        # iteration; pd.concat raises on an empty list, hence the guard.
        return pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

    customer = _stack(customer_parts)
    campaigns_kpi = _stack(campaigns_kpi_parts)
    campaigns_brut = _stack(campaigns_brut_parts)
    tickets = _stack(tickets_parts)
    products = _stack(products_parts)

    return customer, campaigns_kpi, campaigns_brut, tickets, products
|
|
||||||
|
|
||||||
|
|
||||||
def save_file_s3(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG in the S3 bucket.

    The figure is rendered into an in-memory buffer and written to
    ``projet-bdc2324-team1/stat_desc/<type_of_activity>/<File_name><type_of_activity>.png``
    through the module-level ``fs`` filesystem object. The current figure is
    closed afterwards, even when rendering or uploading fails.

    NOTE(review): callers invoke ``plt.show()`` before this function; on
    backends where ``show()`` discards the figure, an empty canvas would be
    saved - confirm with the backend actually used.
    """
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    try:
        image_buffer = io.BytesIO()
        plt.savefig(image_buffer, format='png')
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Always release the figure so failed uploads do not pile up open figures.
        plt.close()
|
|
||||||
|
|
||||||
|
|
||||||
def outlier_detection(tickets, company_list, show_diagram=False):
    """Return, for each company, the customer with the largest total amount.

    Args:
        tickets: DataFrame with ``number_company``, ``customer_id`` and
            ``total_amount`` columns.
        company_list: iterable of company identifiers convertible with ``int``.
        show_diagram: when true, draw a pie chart per company comparing the
            top customer with the sum of all the others.

    Returns:
        list: one ``customer_id`` per company that has at least one ticket
        row, in the order of *company_list*. Companies without any row are
        skipped (they used to raise ``IndexError``).
    """
    outlier_list = []

    for company in company_list:
        company_tickets = tickets[tickets['number_company'] == int(company)]
        amount_by_customer = company_tickets.groupby('customer_id')['total_amount'].sum()
        if amount_by_customer.empty:
            # Nothing sold by this company: no outlier to report.
            continue

        ranked = amount_by_customer.sort_values(ascending=False)
        top = ranked.iloc[:1]
        outlier_list.append(top.index[0])

        if show_diagram:
            rest_sum = ranked.iloc[1:].sum()
            new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
            plt.show()

    return outlier_list
|
|
||||||
|
|
||||||
|
|
||||||
def valid_customer_detection(products, campaigns_brut, start_date="2021-05-01"):
    """List the customers active on or after *start_date*.

    A customer is listed once per purchase row whose ``purchase_date`` is on
    or after *start_date*, and once per campaign row whose ``sent_at`` is on
    or after *start_date*; the result may therefore contain duplicates.

    Args:
        products: DataFrame with ``customer_id`` and ``purchase_date`` columns.
        campaigns_brut: DataFrame with ``customer_id`` and ``sent_at`` columns.
        start_date: lower bound (inclusive) compared against both date
            columns; defaults to the historical value ``"2021-05-01"``.

    Returns:
        list: customer ids from purchases followed by those from campaigns.
    """
    recent_purchases = products['purchase_date'] >= start_date
    recent_mails = campaigns_brut["sent_at"] >= start_date

    consumer_valid_product = products.loc[recent_purchases, 'customer_id'].to_list()
    consumer_valid_campaigns = campaigns_brut.loc[recent_mails, 'customer_id'].to_list()

    return consumer_valid_product + consumer_valid_campaigns
|
|
||||||
|
|
||||||
|
|
||||||
def identify_purchase_during_target_periode(products, period_start="2022-11-01", period_end="2023-11-01"):
    """List the customers who purchased within the target period.

    Args:
        products: DataFrame with ``customer_id`` and ``purchase_date`` columns.
        period_start: inclusive lower bound for ``purchase_date``.
        period_end: inclusive upper bound for ``purchase_date``. With a bare
            date this is midnight at the start of that day, so later
            purchases on the same day are excluded.

    Returns:
        list: one ``customer_id`` per matching purchase row (duplicates kept).
    """
    in_period = (products['purchase_date'] >= period_start) & (products['purchase_date'] <= period_end)
    return products.loc[in_period, 'customer_id'].to_list()
|
|
||||||
|
|
||||||
|
|
||||||
def remove_elements(lst, elements_to_remove):
    """Concatenate the items of *lst* that are not in *elements_to_remove*.

    *lst* is iterated item by item (character by character for a string) and
    the surviving items are joined into a single string.
    """
    kept = []
    for item in lst:
        if item in elements_to_remove:
            continue
        kept.append(item)
    return ''.join(kept)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_nb_clients(customer, type_of_activity):
    """Bar chart of the number of buying customers per company.

    Counts the non-null ``customer_id`` values of rows with
    ``purchase_count > 0`` for each ``number_company``, plots them in
    thousands, shows the figure and hands it to ``save_file_s3``.
    """
    buyers = customer.loc[customer["purchase_count"] > 0]
    clients_per_company = (
        buyers.groupby("number_company")["customer_id"]
        .count()
        .reset_index()
    )
    company_numbers = clients_per_company["number_company"]

    plt.bar(company_numbers, clients_per_company["customer_id"] / 1000)
    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def maximum_price_paid(customer, type_of_activity):
    """Bar chart of the highest ``max_price`` value per company.

    Shows the figure, then hands it to ``save_file_s3``.
    """
    top_price = customer.groupby("number_company")["max_price"].max().reset_index()
    company_numbers = top_price["number_company"]

    plt.bar(company_numbers, top_price["max_price"])
    plt.xlabel('Company')
    # NOTE(review): the trailing "Prix" looks like a leftover word - confirm before changing the label.
    plt.ylabel("Maximal price of a ticket Prix")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def mailing_consent(customer, type_of_activity):
    """Bar chart of the mean ``opt_in`` value per company.

    Shows the figure, then hands it to ``save_file_s3``.
    """
    # Named differently from the function so the local does not shadow it.
    consent_by_company = customer.groupby("number_company")["opt_in"].mean().reset_index()
    company_numbers = consent_by_company["number_company"]

    plt.bar(company_numbers, consent_by_company["opt_in"])
    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Consent of mailing for {type_of_activity}')
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def mailing_consent_by_target(customer, type_of_activity=None):
    """Grouped bar chart of mailing consent per company and purchase status.

    For every ``number_company`` the mean ``opt_in`` (as a percentage) is
    plotted separately for customers with and without a purchase in the
    target period (``has_purchased_target_period``). The figure is shown and
    then handed to ``save_file_s3``.

    Args:
        customer: DataFrame with ``number_company``,
            ``has_purchased_target_period`` and ``opt_in`` columns.
        type_of_activity: activity label used in the title and the output
            path. When omitted, the module-level ``type_of_activity``
            variable is used, as the one-argument form always did.

    Raises:
        NameError: if *type_of_activity* is omitted and no module-level
            variable of that name exists.
    """
    if type_of_activity is None:
        # Backward compatibility with callers passing only `customer`.
        try:
            type_of_activity = globals()["type_of_activity"]
        except KeyError:
            raise NameError("name 'type_of_activity' is not defined") from None

    # Rows: companies; columns: purchase flag. A missing (company, flag)
    # combination becomes NaN (no bar) instead of raising IndexError.
    consent = (
        customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"]
        .mean()
        .unstack("has_purchased_target_period")
        * 100
    )

    fig, ax = plt.subplots(figsize=(10, 6))

    bar_width = 0.35
    base_positions = np.arange(len(consent.index))

    # One group of bars per purchase flag, shifted by one bar width each time.
    for offset, label in enumerate(consent.columns):
        label_printed = "purchased" if label else "no purchase"
        ax.bar(base_positions + offset * bar_width, consent[label].to_numpy(),
               bar_width, label=label_printed)

    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
    ax.set_xticks(base_positions + bar_width / 2)
    ax.set_xticklabels(consent.index)
    ax.legend()

    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def gender_bar(customer, type_of_activity):
    """Stacked bar chart of the mean gender indicators per company.

    Stacks the per-company means of ``gender_male``, ``gender_female`` and
    ``gender_other``, shows the figure and hands it to ``save_file_s3``.
    """
    gender_columns = ["gender_male", "gender_female", "gender_other"]
    shares = customer.groupby("number_company")[gender_columns].mean().reset_index()
    company_numbers = shares["number_company"]

    male_share = shares["gender_male"]
    female_share = shares["gender_female"]

    # Each layer starts where the previous ones end.
    plt.bar(company_numbers, male_share, label="Homme")
    plt.bar(company_numbers, female_share, bottom=male_share, label="Femme")
    plt.bar(company_numbers, shares["gender_other"],
            bottom=male_share + female_share, label="Inconnu")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of Customer for {type_of_activity}")
    plt.legend()
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def country_bar(customer, type_of_activity):
    """Bar chart of the mean ``country_fr`` value per company.

    Shows the figure, then hands it to ``save_file_s3``.
    """
    french_share = customer.groupby("number_company")["country_fr"].mean().reset_index()
    company_numbers = french_share["number_company"]

    plt.bar(company_numbers, french_share["country_fr"])
    plt.xlabel('Company')
    plt.ylabel("Share of French Customer")
    plt.title(f"Share of French Customer for {type_of_activity}")
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def lazy_customer_plot(campaigns_kpi, type_of_activity):
    """Bar chart of the mean ``nb_campaigns_opened`` value per company.

    Shows the figure, then hands it to ``save_file_s3``.

    NOTE(review): the axis label and title speak of the share of customers
    who did not open mail, while the plotted value is the mean number of
    opened campaigns - confirm which one is intended.
    """
    opened_by_company = (
        campaigns_kpi.groupby("number_company")["nb_campaigns_opened"]
        .mean()
        .reset_index()
    )
    company_numbers = opened_by_company["number_company"]

    plt.bar(company_numbers, opened_by_company["nb_campaigns_opened"])
    plt.xlabel('Company')
    plt.ylabel("Share of Customers who did not open mail")
    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def campaigns_effectiveness(customer, type_of_activity):
    """Bar chart of the mean ``opt_in`` value per company.

    Shows the figure, then hands it to ``save_file_s3``.

    NOTE(review): the labels announce a number of customers (thousands) but
    the plotted value is the mean of ``opt_in`` - confirm the intended metric.
    """
    # Named differently from the function so the local does not shadow it.
    effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()
    company_numbers = effectiveness["number_company"]

    plt.bar(company_numbers, effectiveness["opt_in"])
    plt.xlabel('Company')
    plt.ylabel("Number of Customers (thousands)")
    plt.title(f"Number of Customers of have bought or have received mails for {type_of_activity}")
    # No artist carries a label here, so the legend has nothing to display.
    plt.legend()
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def sale_dynamics(products, campaigns_brut, type_of_activity):
    """Stacked monthly bar chart of purchases by new versus known customers.

    A customer becomes "known" from the month of their first purchase or
    first received e-mail, whichever comes first. A purchase counts as made
    by a known customer when it happens more than one month after that date.
    Monthly purchase counts from March 2021 onwards are stacked (new
    customers below, known customers on top); the figure is shown and then
    handed to ``save_file_s3``.

    Args:
        products: DataFrame with ``customer_id``, ``purchase_id`` and
            ``purchase_date`` columns.
        campaigns_brut: DataFrame with ``customer_id`` and ``sent_at`` columns.
        type_of_activity: label used in the title and the output path.
    """
    # Month of the first purchase of each customer.
    purchase_min = (
        products.groupby(['customer_id'])['purchase_date'].min().reset_index()
        .rename(columns={'purchase_date': 'first_purchase_event'})
    )
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first e-mail received by each customer.
    first_mail_received = (
        campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
        .rename(columns={'sent_at': 'first_email_reception'})
    )
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge both sources; a customer may appear in only one of them.
    known_customer = pd.merge(
        purchase_min[['customer_id', 'first_purchase_month']],
        first_mail_received[['customer_id', 'first_email_month']],
        on='customer_id', how='outer',
    )

    # Month from which the customer is considered known.
    earliest_contact = known_customer[['first_email_month', 'first_purchase_month']].min(axis=1)
    known_customer['known_date'] = pd.to_datetime(earliest_contact, utc=True, format='ISO8601')

    # One row per distinct purchase, with the date the customer became known.
    distinct_purchases = products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates()
    purchases_count = pd.merge(distinct_purchases, known_customer[['customer_id', 'known_date']],
                               on=['customer_id'], how='inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    # NOTE(review): when customer_id holds prefixed strings (e.g. '1_1') this
    # comparison with the integer 1 filters nothing - confirm the intent.
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of purchases per month and per customer type.
    nb_purchases_graph = (
        purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id']
        .count().reset_index()
        .rename(columns={'purchase_id': 'nb_purchases'})
    )

    purchases_graph_used = nb_purchases_graph[nb_purchases_graph["purchase_date_month"] >= datetime(2021, 3, 1)]
    purchases_new = purchases_graph_used[purchases_graph_used["is_customer_known"] == False]
    purchases_known = purchases_graph_used[purchases_graph_used["is_customer_known"] == True]

    # Outer merge so a month with only one customer type is still plotted;
    # the missing side is a count of zero.
    merged_data = pd.merge(purchases_new, purchases_known, on="purchase_date_month",
                           how="outer", suffixes=("_new", "_old")).sort_values("purchase_date_month")
    count_columns = ["nb_purchases_new", "nb_purchases_old"]
    merged_data[count_columns] = merged_data[count_columns].fillna(0)

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")

    # Display ticks as abbreviated month + two-digit year.
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def tickets_internet(tickets, type_of_activity):
    """Bar chart of the percentage of tickets bought online per company.

    Sums ``nb_tickets`` and ``nb_tickets_internet`` per ``number_company``,
    plots their ratio as a percentage, shows the figure and hands it to
    ``save_file_s3``.
    """
    ticket_columns = ["nb_tickets", "nb_tickets_internet"]
    totals = tickets.groupby("number_company")[ticket_columns].sum().reset_index()
    totals["Share_ticket_internet"] = totals["nb_tickets_internet"] * 100 / totals["nb_tickets"]
    company_numbers = totals["number_company"]

    plt.bar(company_numbers, totals["Share_ticket_internet"])
    plt.xlabel('Company')
    plt.ylabel("Share of Tickets Bought Online")
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
    plt.xticks(company_numbers, [f"{number}" for number in company_numbers])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
||||||
def box_plot_price_tickets(tickets, type_of_activity):
    """Box plot of strictly positive ``total_amount`` values per company.

    Outliers are hidden and the mean is marked. The figure is shown and then
    handed to ``save_file_s3``.
    """
    price_tickets = tickets[tickets['total_amount'] > 0]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company",
                showfliers=False, showmeans=True)
    plt.title(f"Box plot of price tickets for {type_of_activity}")
    # No manual plt.xticks: seaborn already labels each box with its company
    # number. The previous call passed one tick per ticket row, at positions
    # equal to the company numbers, which do not match the box positions.
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)
|
|
||||||
|
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,436 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "135a67de-cff8-4345-bacc-d9f9fa68a41f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"import re\n",
|
|
||||||
"from sklearn.linear_model import LogisticRegression\n",
|
|
||||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
|
||||||
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
|
|
||||||
"from sklearn.utils import class_weight\n",
|
|
||||||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
|
||||||
"from sklearn.compose import ColumnTransformer\n",
|
|
||||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
|
||||||
"from sklearn.impute import SimpleImputer\n",
|
|
||||||
"from sklearn.model_selection import GridSearchCV\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
|
|
||||||
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
|
|
||||||
"import seaborn as sns\n",
|
|
||||||
"import matplotlib.pyplot as plt\n",
|
|
||||||
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
|
|
||||||
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
|
|
||||||
"\n",
|
|
||||||
"import statsmodels.api as sm\n",
|
|
||||||
"\n",
|
|
||||||
"import pickle\n",
|
|
||||||
"import warnings"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "9a6254df-d496-4957-89ea-9ed2b74049dd",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Create filesystem object\n",
|
|
||||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
||||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "922cf05f-8343-4ed0-ad62-3ef1f17c0730",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def load_train_test():\n",
|
|
||||||
" BUCKET = \"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/musee\"\n",
|
|
||||||
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
|
|
||||||
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
|
|
||||||
" \n",
|
|
||||||
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
|
|
||||||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
|
|
||||||
"\n",
|
|
||||||
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
|
|
||||||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
|
||||||
" \n",
|
|
||||||
" return dataset_train, dataset_test\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def features_target_split(dataset_train, dataset_test):\n",
|
|
||||||
" features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',\n",
|
|
||||||
" 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n",
|
|
||||||
" 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n",
|
|
||||||
" 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',\n",
|
|
||||||
" 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',\n",
|
|
||||||
" 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',\n",
|
|
||||||
" 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',\n",
|
|
||||||
" 'target_jeune', 'target_abonne']\n",
|
|
||||||
" X_train = dataset_train[features_l]\n",
|
|
||||||
" y_train = dataset_train[['y_has_purchased']]\n",
|
|
||||||
"\n",
|
|
||||||
" X_test = dataset_test[features_l]\n",
|
|
||||||
" y_test = dataset_test[['y_has_purchased']]\n",
|
|
||||||
" return X_train, X_test, y_train, y_test"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "2584e454-111b-4c39-881b-676841cb5aa1",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/tmp/ipykernel_498/3950829189.py:7: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
||||||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"/tmp/ipykernel_498/3950829189.py:11: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
||||||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"dataset_train, dataset_test = load_train_test()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "a32ea7f8-e2d3-44db-8937-5afda9447b58",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 22,
|
|
||||||
"id": "3bdc8840-7f45-416f-8ee0-307db201c496",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"const 0\n",
|
|
||||||
"nb_campaigns 0\n",
|
|
||||||
"taux_ouverture_mail 0\n",
|
|
||||||
"prop_purchases_internet 0\n",
|
|
||||||
"nb_tickets 0\n",
|
|
||||||
"nb_purchases 0\n",
|
|
||||||
"total_amount 0\n",
|
|
||||||
"nb_suppliers 0\n",
|
|
||||||
"time_to_open 0\n",
|
|
||||||
"purchases_10_2021 0\n",
|
|
||||||
"purchases_10_2022 0\n",
|
|
||||||
"purchases_11_2021 0\n",
|
|
||||||
"purchases_12_2021 0\n",
|
|
||||||
"purchases_1_2022 0\n",
|
|
||||||
"purchases_2_2022 0\n",
|
|
||||||
"purchases_3_2022 0\n",
|
|
||||||
"purchases_4_2022 0\n",
|
|
||||||
"purchases_5_2021 0\n",
|
|
||||||
"purchases_5_2022 0\n",
|
|
||||||
"purchases_6_2021 0\n",
|
|
||||||
"purchases_6_2022 0\n",
|
|
||||||
"purchases_7_2021 0\n",
|
|
||||||
"purchases_7_2022 0\n",
|
|
||||||
"purchases_8_2021 0\n",
|
|
||||||
"purchases_8_2022 0\n",
|
|
||||||
"purchases_9_2021 0\n",
|
|
||||||
"purchases_9_2022 0\n",
|
|
||||||
"purchase_date_min 0\n",
|
|
||||||
"purchase_date_max 0\n",
|
|
||||||
"nb_targets 0\n",
|
|
||||||
"gender_female 0\n",
|
|
||||||
"gender_male 0\n",
|
|
||||||
"achat_internet 0\n",
|
|
||||||
"categorie_age_0_10 0\n",
|
|
||||||
"categorie_age_10_20 0\n",
|
|
||||||
"categorie_age_20_30 0\n",
|
|
||||||
"categorie_age_30_40 0\n",
|
|
||||||
"categorie_age_40_50 0\n",
|
|
||||||
"categorie_age_50_60 0\n",
|
|
||||||
"categorie_age_60_70 0\n",
|
|
||||||
"categorie_age_70_80 0\n",
|
|
||||||
"categorie_age_plus_80 0\n",
|
|
||||||
"categorie_age_inconnue 0\n",
|
|
||||||
"country_fr 0\n",
|
|
||||||
"is_profession_known 0\n",
|
|
||||||
"is_zipcode_known 0\n",
|
|
||||||
"opt_in 0\n",
|
|
||||||
"target_optin 0\n",
|
|
||||||
"target_newsletter 0\n",
|
|
||||||
"target_scolaire 0\n",
|
|
||||||
"target_entreprise 0\n",
|
|
||||||
"target_famille 0\n",
|
|
||||||
"target_jeune 0\n",
|
|
||||||
"target_abonne 0\n",
|
|
||||||
"dtype: int64"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 22,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"X_train.isna().sum()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 17,
|
|
||||||
"id": "3c3ac545-52e0-4d0c-afdc-fff70f468a94",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"1.0"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 17,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"most_frequent_value = X_train['country_fr'].mode()[0]\n",
|
|
||||||
"most_frequent_value"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 21,
|
|
||||||
"id": "0fcdc5ee-bcea-4436-be9b-92b79d27a230",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"X_train['country_fr'] = X_train['country_fr'].fillna(most_frequent_value)\n",
|
|
||||||
"X_train['time_to_open'] = X_train['time_to_open'].fillna(0)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "7ecdaf1a-b5e4-4880-871e-363eae6fe4e1",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n",
|
|
||||||
" y = y_train['y_has_purchased'])\n",
|
|
||||||
"\n",
|
|
||||||
"weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"id": "a6b56090-cfe9-4772-810c-d36bf12aceca",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"array([0.52239696, 0.52239696, 0.52239696, ..., 0.52239696, 0.52239696,\n",
|
|
||||||
" 0.52239696])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"\n",
|
|
||||||
"class_counts = np.bincount(y_train['y_has_purchased'])\n",
|
|
||||||
"class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)\n",
|
|
||||||
"\n",
|
|
||||||
"weights = class_weights[y_train['y_has_purchased'].values.astype(int)]\n",
|
|
||||||
"weights"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"id": "bfaea23e-7d7a-4c0d-96f6-4ab4c7c2ff51",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"X_train = sm.add_constant(X_train)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 26,
|
|
||||||
"id": "4cf97ae5-9dcf-4f4c-91b3-3b1f339a6213",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
|
|
||||||
" 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n",
|
|
||||||
" 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n",
|
|
||||||
" 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 27,
|
|
||||||
"id": "debb36df-3c2f-4cf7-83a9-ad6e4f6b0470",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"scaler = StandardScaler()\n",
|
|
||||||
"\n",
|
|
||||||
"X_train_scaled_columns = scaler.fit_transform(X_train[numeric_features])\n",
|
|
||||||
"\n",
|
|
||||||
"X_train_scaled = X_train.copy() #\n",
|
|
||||||
"X_train_scaled[numeric_features] = X_train_scaled_columns"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 28,
|
|
||||||
"id": "7eaa6160-20a0-4a78-ac38-0411e19707ed",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/opt/mamba/lib/python3.11/site-packages/statsmodels/base/optimizer.py:18: FutureWarning: Keyword arguments have been passed to the optimizer that have no effect. The list of allowed keyword arguments for method newton is: tol, ridge_factor. The list of unsupported keyword arguments passed include: weights. After release 0.14, this will raise.\n",
|
|
||||||
" warnings.warn(\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Optimization terminated successfully.\n",
|
|
||||||
" Current function value: 0.136180\n",
|
|
||||||
" Iterations 9\n",
|
|
||||||
" Logit Regression Results \n",
|
|
||||||
"==============================================================================\n",
|
|
||||||
"Dep. Variable: y_has_purchased No. Observations: 434278\n",
|
|
||||||
"Model: Logit Df Residuals: 434226\n",
|
|
||||||
"Method: MLE Df Model: 51\n",
|
|
||||||
"Date: Thu, 04 Apr 2024 Pseudo R-squ.: 0.2305\n",
|
|
||||||
"Time: 06:09:09 Log-Likelihood: -59140.\n",
|
|
||||||
"converged: True LL-Null: -76855.\n",
|
|
||||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
|
||||||
"===========================================================================================\n",
|
|
||||||
" coef std err z P>|z| [0.025 0.975]\n",
|
|
||||||
"-------------------------------------------------------------------------------------------\n",
|
|
||||||
"const -4.0679 1.65e+06 -2.46e-06 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"nb_campaigns 0.0916 0.012 7.352 0.000 0.067 0.116\n",
|
|
||||||
"taux_ouverture_mail 0.0012 0.011 0.106 0.916 -0.021 0.023\n",
|
|
||||||
"prop_purchases_internet -0.1995 0.067 -2.972 0.003 -0.331 -0.068\n",
|
|
||||||
"nb_tickets 0.5956 0.193 3.091 0.002 0.218 0.973\n",
|
|
||||||
"nb_purchases 0.1598 1.71e+06 9.37e-08 1.000 -3.34e+06 3.34e+06\n",
|
|
||||||
"total_amount -0.1938 0.071 -2.724 0.006 -0.333 -0.054\n",
|
|
||||||
"nb_suppliers 0.0282 0.021 1.348 0.178 -0.013 0.069\n",
|
|
||||||
"time_to_open 0.2785 0.018 15.534 0.000 0.243 0.314\n",
|
|
||||||
"purchases_10_2021 0.0417 4.76e+04 8.76e-07 1.000 -9.34e+04 9.34e+04\n",
|
|
||||||
"purchases_10_2022 0.4578 2.72e+05 1.68e-06 1.000 -5.33e+05 5.33e+05\n",
|
|
||||||
"purchases_11_2021 0.0252 4.92e+04 5.12e-07 1.000 -9.65e+04 9.65e+04\n",
|
|
||||||
"purchases_12_2021 0.0221 6.3e+04 3.5e-07 1.000 -1.24e+05 1.24e+05\n",
|
|
||||||
"purchases_1_2022 0.0083 5.49e+04 1.52e-07 1.000 -1.08e+05 1.08e+05\n",
|
|
||||||
"purchases_2_2022 0.0462 7.59e+04 6.09e-07 1.000 -1.49e+05 1.49e+05\n",
|
|
||||||
"purchases_3_2022 0.0928 1.07e+05 8.67e-07 1.000 -2.1e+05 2.1e+05\n",
|
|
||||||
"purchases_4_2022 0.1446 1.65e+05 8.75e-07 1.000 -3.24e+05 3.24e+05\n",
|
|
||||||
"purchases_5_2021 -0.0427 4.84e+04 -8.83e-07 1.000 -9.48e+04 9.48e+04\n",
|
|
||||||
"purchases_5_2022 0.1412 1.67e+05 8.46e-07 1.000 -3.27e+05 3.27e+05\n",
|
|
||||||
"purchases_6_2021 -0.0252 5.55e+04 -4.54e-07 1.000 -1.09e+05 1.09e+05\n",
|
|
||||||
"purchases_6_2022 0.1246 1.84e+05 6.77e-07 1.000 -3.6e+05 3.6e+05\n",
|
|
||||||
"purchases_7_2021 -0.0252 5.55e+04 -4.55e-07 1.000 -1.09e+05 1.09e+05\n",
|
|
||||||
"purchases_7_2022 -0.0074 2.1e+05 -3.54e-08 1.000 -4.12e+05 4.12e+05\n",
|
|
||||||
"purchases_8_2021 0.0116 5.26e+04 2.21e-07 1.000 -1.03e+05 1.03e+05\n",
|
|
||||||
"purchases_8_2022 0.0554 2.4e+05 2.31e-07 1.000 -4.7e+05 4.7e+05\n",
|
|
||||||
"purchases_9_2021 -0.0320 5.47e+04 -5.85e-07 1.000 -1.07e+05 1.07e+05\n",
|
|
||||||
"purchases_9_2022 0.2349 2.2e+05 1.07e-06 1.000 -4.32e+05 4.32e+05\n",
|
|
||||||
"purchase_date_min 0.0781 0.025 3.092 0.002 0.029 0.128\n",
|
|
||||||
"purchase_date_max -0.5228 0.026 -20.021 0.000 -0.574 -0.472\n",
|
|
||||||
"nb_targets 0.7083 0.010 74.555 0.000 0.690 0.727\n",
|
|
||||||
"gender_female 0.2961 0.038 7.701 0.000 0.221 0.371\n",
|
|
||||||
"gender_male 0.0450 0.040 1.137 0.256 -0.033 0.123\n",
|
|
||||||
"achat_internet 0.1869 0.158 1.186 0.236 -0.122 0.496\n",
|
|
||||||
"categorie_age_0_10 -0.2713 1.65e+06 -1.64e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_10_20 -0.1238 1.65e+06 -7.48e-08 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_20_30 -0.6322 1.65e+06 -3.82e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_30_40 -0.5004 1.65e+06 -3.02e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_40_50 -0.4020 1.65e+06 -2.43e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_50_60 -0.4101 1.65e+06 -2.48e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_60_70 -0.3232 1.65e+06 -1.95e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_70_80 -0.1635 1.65e+06 -9.88e-08 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_plus_80 -0.4677 1.65e+06 -2.83e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"categorie_age_inconnue -0.7737 1.65e+06 -4.68e-07 1.000 -3.24e+06 3.24e+06\n",
|
|
||||||
"country_fr 0.7419 0.065 11.422 0.000 0.615 0.869\n",
|
|
||||||
"is_profession_known -0.5947 0.066 -9.074 0.000 -0.723 -0.466\n",
|
|
||||||
"is_zipcode_known 1.1374 0.027 41.609 0.000 1.084 1.191\n",
|
|
||||||
"opt_in -1.0658 0.030 -35.485 0.000 -1.125 -1.007\n",
|
|
||||||
"target_optin 0.5946 0.034 17.361 0.000 0.527 0.662\n",
|
|
||||||
"target_newsletter -1.0237 0.035 -29.411 0.000 -1.092 -0.955\n",
|
|
||||||
"target_scolaire 0.0428 0.036 1.188 0.235 -0.028 0.113\n",
|
|
||||||
"target_entreprise -0.2645 0.058 -4.589 0.000 -0.377 -0.152\n",
|
|
||||||
"target_famille 0.5035 0.035 14.548 0.000 0.436 0.571\n",
|
|
||||||
"target_jeune -0.6795 0.029 -23.590 0.000 -0.736 -0.623\n",
|
|
||||||
"target_abonne 0.0677 0.037 1.833 0.067 -0.005 0.140\n",
|
|
||||||
"===========================================================================================\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"model_logit = sm.Logit(y_train, X_train_scaled)\n",
|
|
||||||
"\n",
|
|
||||||
"result = model_logit.fit(weights=weights)\n",
|
|
||||||
"\n",
|
|
||||||
"print(result.summary())"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "75dc92c7-cc1e-40f1-bc74-0b04043b7e44",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.6"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
@ -1,825 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Business Data Challenge - Team 1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Configuration de l'accès aux données"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "0b029d42-fb02-481e-a407-7e41886198a6",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['bdc2324-data/1',\n",
|
|
||||||
" 'bdc2324-data/10',\n",
|
|
||||||
" 'bdc2324-data/101',\n",
|
|
||||||
" 'bdc2324-data/11',\n",
|
|
||||||
" 'bdc2324-data/12',\n",
|
|
||||||
" 'bdc2324-data/13',\n",
|
|
||||||
" 'bdc2324-data/14',\n",
|
|
||||||
" 'bdc2324-data/2',\n",
|
|
||||||
" 'bdc2324-data/3',\n",
|
|
||||||
" 'bdc2324-data/4',\n",
|
|
||||||
" 'bdc2324-data/5',\n",
|
|
||||||
" 'bdc2324-data/6',\n",
|
|
||||||
" 'bdc2324-data/7',\n",
|
|
||||||
" 'bdc2324-data/8',\n",
|
|
||||||
" 'bdc2324-data/9']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"# Create filesystem object\n",
|
|
||||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
||||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
|
|
||||||
"\n",
|
|
||||||
"BUCKET = \"bdc2324-data\"\n",
|
|
||||||
"fs.ls(BUCKET)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement des fichiers campaign_stats.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Conversion des dates 'sent_at'\n",
|
|
||||||
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"2023-11-09 18:10:45+00:00\n",
|
|
||||||
"2020-06-02 08:24:08+00:00\n",
|
|
||||||
"2023-10-12 01:39:48+00:00\n",
|
|
||||||
"2023-10-10 17:06:29+00:00\n",
|
|
||||||
"2023-11-01 09:20:48+00:00\n",
|
|
||||||
"2021-03-31 14:59:02+00:00\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].min())"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "c75632df-b018-4bb8-a99d-83f15af94369",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"0 2021-03-28 16:01:09+00:00\n",
|
|
||||||
"1 2021-03-28 16:01:09+00:00\n",
|
|
||||||
"2 2021-03-28 16:00:59+00:00\n",
|
|
||||||
"3 2021-03-28 16:00:59+00:00\n",
|
|
||||||
"4 2021-03-28 16:01:06+00:00\n",
|
|
||||||
" ... \n",
|
|
||||||
"6214803 2023-10-23 09:32:33+00:00\n",
|
|
||||||
"6214804 2023-10-23 09:32:49+00:00\n",
|
|
||||||
"6214805 2023-10-23 09:33:28+00:00\n",
|
|
||||||
"6214806 2023-10-23 09:31:53+00:00\n",
|
|
||||||
"6214807 2023-10-23 09:33:54+00:00\n",
|
|
||||||
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"campaign_stats_1['sent_at']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Customersplus.csv"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "d3bf880d-1065-4d5b-9954-1830aa5081af",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
||||||
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"id": "7368f381-db8e-4a4d-9fe2-5947eb55be58",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
|
|
||||||
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
|
|
||||||
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
|
|
||||||
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
|
|
||||||
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
|
|
||||||
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
|
|
||||||
" 'average_purchase_delay', 'average_price_basket',\n",
|
|
||||||
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
|
|
||||||
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
|
|
||||||
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
|
|
||||||
" 'tenant_id'],\n",
|
|
||||||
" dtype='object')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.columns"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "08091935-b159-47fa-806c-e1444f3b227e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.shape"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "9f8c8868-c1ac-4cee-af08-533d928f6764",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "bf95daf2-4852-4718-b474-207a1ebd8ac4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_2['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "1425c385-3216-4e4f-ae8f-a121624721ba",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 61,
|
|
||||||
"id": "92533026-e27c-4f1f-81ca-64eda32a34c0",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 61,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
|
|
||||||
"# Exemple id commun = caractéristiques communes\n",
|
|
||||||
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
|
|
||||||
"\n",
|
|
||||||
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 49,
|
|
||||||
"id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"id 0.000000\n",
|
|
||||||
"lastname 43.461341\n",
|
|
||||||
"firstname 44.995588\n",
|
|
||||||
"birthdate 96.419870\n",
|
|
||||||
"email 8.622075\n",
|
|
||||||
"street_id 0.000000\n",
|
|
||||||
"created_at 0.000000\n",
|
|
||||||
"updated_at 0.000000\n",
|
|
||||||
"civility 100.000000\n",
|
|
||||||
"is_partner 0.000000\n",
|
|
||||||
"extra 100.000000\n",
|
|
||||||
"deleted_at 100.000000\n",
|
|
||||||
"reference 100.000000\n",
|
|
||||||
"gender 0.000000\n",
|
|
||||||
"is_email_true 0.000000\n",
|
|
||||||
"extra_field 100.000000\n",
|
|
||||||
"identifier 0.000000\n",
|
|
||||||
"opt_in 0.000000\n",
|
|
||||||
"structure_id 88.072380\n",
|
|
||||||
"note 99.403421\n",
|
|
||||||
"profession 95.913503\n",
|
|
||||||
"language 99.280945\n",
|
|
||||||
"mcp_contact_id 34.876141\n",
|
|
||||||
"need_reload 0.000000\n",
|
|
||||||
"last_buying_date 51.653431\n",
|
|
||||||
"max_price 51.653431\n",
|
|
||||||
"ticket_sum 0.000000\n",
|
|
||||||
"average_price 8.639195\n",
|
|
||||||
"fidelity 0.000000\n",
|
|
||||||
"average_purchase_delay 51.653431\n",
|
|
||||||
"average_price_basket 51.653431\n",
|
|
||||||
"average_ticket_basket 51.653431\n",
|
|
||||||
"total_price 43.014236\n",
|
|
||||||
"preferred_category 100.000000\n",
|
|
||||||
"preferred_supplier 100.000000\n",
|
|
||||||
"preferred_formula 100.000000\n",
|
|
||||||
"purchase_count 0.000000\n",
|
|
||||||
"first_buying_date 51.653431\n",
|
|
||||||
"last_visiting_date 100.000000\n",
|
|
||||||
"zipcode 71.176564\n",
|
|
||||||
"country 5.459418\n",
|
|
||||||
"age 96.419870\n",
|
|
||||||
"tenant_id 0.000000\n",
|
|
||||||
"dtype: float64\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"pd.DataFrame(customers_plus_1.isna().mean()*100)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 11,
|
|
||||||
"id": "6d62e73f-3925-490f-9fd4-d0e838903cb2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement de toutes les données\n",
|
|
||||||
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
|
|
||||||
"\n",
|
|
||||||
"for nom_base in liste_base:\n",
|
|
||||||
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
|
|
||||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>id</th>\n",
|
|
||||||
" <th>lastname</th>\n",
|
|
||||||
" <th>firstname</th>\n",
|
|
||||||
" <th>birthdate</th>\n",
|
|
||||||
" <th>email</th>\n",
|
|
||||||
" <th>street_id</th>\n",
|
|
||||||
" <th>created_at</th>\n",
|
|
||||||
" <th>updated_at</th>\n",
|
|
||||||
" <th>civility</th>\n",
|
|
||||||
" <th>is_partner</th>\n",
|
|
||||||
" <th>...</th>\n",
|
|
||||||
" <th>tenant_id</th>\n",
|
|
||||||
" <th>id_x</th>\n",
|
|
||||||
" <th>customer_id</th>\n",
|
|
||||||
" <th>purchase_date</th>\n",
|
|
||||||
" <th>type_of</th>\n",
|
|
||||||
" <th>is_from_subscription</th>\n",
|
|
||||||
" <th>amount</th>\n",
|
|
||||||
" <th>is_full_price</th>\n",
|
|
||||||
" <th>start_date_time</th>\n",
|
|
||||||
" <th>event_name</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>lastname405082</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>992423</td>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>2023-01-11 17:08:41+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>13.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-02-06 20:00:00+01:00</td>\n",
|
|
||||||
" <td>zaide</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>lastname405082</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>992423</td>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>2023-01-11 17:08:41+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>13.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-02-06 20:00:00+01:00</td>\n",
|
|
||||||
" <td>zaide</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>lastname411168</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1053934</td>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>2023-03-16 16:23:10+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>62.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-03-19 16:00:00+01:00</td>\n",
|
|
||||||
" <td>luisa miller</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>lastname411168</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1053934</td>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>2023-03-16 16:23:10+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>62.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-03-19 16:00:00+01:00</td>\n",
|
|
||||||
" <td>luisa miller</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>4380</td>\n",
|
|
||||||
" <td>lastname4380</td>\n",
|
|
||||||
" <td>firstname4380</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>2021-04-22 14:51:55.432952+02:00</td>\n",
|
|
||||||
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1189141</td>\n",
|
|
||||||
" <td>4380</td>\n",
|
|
||||||
" <td>2020-11-26 13:12:53+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>51.3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-12-01 20:00:00+01:00</td>\n",
|
|
||||||
" <td>iphigenie en tauride</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>...</th>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318964</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318965</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318966</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318967</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1244277</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>5.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
|
||||||
" <td>a boire et a manger</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318968</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1244277</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>5.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
|
||||||
" <td>a boire et a manger</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"<p>318969 rows × 52 columns</p>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" id lastname firstname birthdate email \\\n",
|
|
||||||
"0 405082 lastname405082 NaN NaN NaN \n",
|
|
||||||
"1 405082 lastname405082 NaN NaN NaN \n",
|
|
||||||
"2 411168 lastname411168 NaN NaN NaN \n",
|
|
||||||
"3 411168 lastname411168 NaN NaN NaN \n",
|
|
||||||
"4 4380 lastname4380 firstname4380 NaN NaN \n",
|
|
||||||
"... ... ... ... ... ... \n",
|
|
||||||
"318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"\n",
|
|
||||||
" street_id created_at \\\n",
|
|
||||||
"0 6 2023-01-12 06:30:31.197484+01:00 \n",
|
|
||||||
"1 6 2023-01-12 06:30:31.197484+01:00 \n",
|
|
||||||
"2 6 2023-03-17 06:30:35.431967+01:00 \n",
|
|
||||||
"3 6 2023-03-17 06:30:35.431967+01:00 \n",
|
|
||||||
"4 1 2021-04-22 14:51:55.432952+02:00 \n",
|
|
||||||
"... ... ... \n",
|
|
||||||
"318964 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318965 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318966 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318967 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318968 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"\n",
|
|
||||||
" updated_at civility is_partner ... \\\n",
|
|
||||||
"0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
|
|
||||||
"1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
|
|
||||||
"2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
|
|
||||||
"3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
|
|
||||||
"4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
|
|
||||||
"... ... ... ... ... \n",
|
|
||||||
"318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"\n",
|
|
||||||
" tenant_id id_x customer_id purchase_date type_of \\\n",
|
|
||||||
"0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
|
|
||||||
"1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
|
|
||||||
"2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
|
|
||||||
"3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
|
|
||||||
"4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
|
|
||||||
"... ... ... ... ... ... \n",
|
|
||||||
"318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
|
|
||||||
"318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
|
|
||||||
"318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
|
|
||||||
"318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
|
|
||||||
"318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
|
|
||||||
"\n",
|
|
||||||
" is_from_subscription amount is_full_price start_date_time \\\n",
|
|
||||||
"0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
|
|
||||||
"1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
|
|
||||||
"2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
|
|
||||||
"3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
|
|
||||||
"4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
|
|
||||||
"... ... ... ... ... \n",
|
|
||||||
"318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
|
|
||||||
"318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
|
|
||||||
"318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
|
|
||||||
"318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
|
|
||||||
"318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
|
|
||||||
"\n",
|
|
||||||
" event_name \n",
|
|
||||||
"0 zaide \n",
|
|
||||||
"1 zaide \n",
|
|
||||||
"2 luisa miller \n",
|
|
||||||
"3 luisa miller \n",
|
|
||||||
"4 iphigenie en tauride \n",
|
|
||||||
"... ... \n",
|
|
||||||
"318964 entre femmes \n",
|
|
||||||
"318965 entre femmes \n",
|
|
||||||
"318966 entre femmes \n",
|
|
||||||
"318967 a boire et a manger \n",
|
|
||||||
"318968 a boire et a manger \n",
|
|
||||||
"\n",
|
|
||||||
"[318969 rows x 52 columns]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
# Join chain: purchases -> tickets -> products -> representations -> events,
# keeping only the columns needed by the next step, then attach customers.
# 'id_x' is the left-hand (purchases) id produced by the first merge
# (presumably suffixed because tickets also carries an 'id' column - TODO confirm).
purchase_ticket_cols = ['id_x', 'customer_id', 'product_id', 'purchase_date', 'type_of', 'is_from_subscription']
merge_1 = purchases.merge(tickets, left_on='id', right_on='purchase_id', how='inner')
merge_1 = merge_1[purchase_ticket_cols]

product_cols = ['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']
merge_2 = products.merge(merge_1, left_on='id', right_on='product_id', how='inner')
merge_2 = merge_2[product_cols]

representation_cols = ['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']
merge_3 = representations.merge(merge_2, left_on='id', right_on='representation_id', how='inner')
merge_3 = merge_3[representation_cols]

event_cols = ['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']
merge_4 = events.merge(merge_3, left_on='id', right_on='event_id', how='inner')
merge_4 = merge_4[event_cols].rename(columns={'name': 'event_name'})

# Final table: one row per purchased ticket of a known customer
customer_event_cols = ['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']
df_customer_event = customersplus.merge(merge_4, left_on='id', right_on='customer_id', how='inner')
df_customer_event = df_customer_event[customer_event_cols]
df_customer_event
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
|
@ -1,460 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "bf34b03c-536f-4f93-93a5-e452552653aa",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdin",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
|
|
||||||
"Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
|
|
||||||
"Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
|
|
||||||
"Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
|
|
||||||
"Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
|
|
||||||
"Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset test : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset train : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset test : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset train : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset test : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset train : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset test : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset train : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset test : SUCCESS\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
|
|
||||||
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
|
|
||||||
"Data filtering : SUCCESS\n",
|
|
||||||
"KPIs construction : SUCCESS\n",
|
|
||||||
"Explanatory variable construction : SUCCESS\n",
|
|
||||||
"Explained variable construction : SUCCESS\n",
|
|
||||||
"Exportation dataset train : SUCCESS\n",
|
|
||||||
"FIN DE LA GENERATION DES DATASETS : SUCCESS\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Business Data Challenge - Team 1\n",
|
|
||||||
"\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"import re\n",
|
|
||||||
"import warnings\n",
|
|
||||||
"from datetime import date, timedelta, datetime\n",
|
|
||||||
"\n",
|
|
# Create filesystem object (endpoint host is read from the environment)
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


# Import KPI construction functions.
# The helper script is executed in this namespace; the file is opened with a
# context manager so the handle is closed once its content has been read.
# NOTE(review): exec() runs arbitrary code - only point it at a trusted file.
with open('0_KPI_functions.py') as kpi_functions_file:
    exec(kpi_functions_file.read())

# Ignore warning
warnings.filterwarnings('ignore')
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
def display_covering_time(df, company, datecover):
    """
    Record and print the purchase-date coverage of one company.

    Parameters
    ----------
    df : DataFrame holding a datetime-like 'purchase_date' column.
    company : key under which the coverage is stored in `datecover`.
    datecover : dict updated in place (and returned) with the coverage.

    Returns
    -------
    dict
        `datecover`, where datecover[company] is the list of naive midnight
        datetimes, one per calendar day from the first to the last purchase
        date, both bounds included.

    Raises
    ------
    ValueError
        If `df` contains no valid purchase date.
    """
    first_purchase = df['purchase_date'].min()
    last_purchase = df['purchase_date'].max()
    if pd.isna(first_purchase) or pd.isna(last_purchase):
        raise ValueError(f"No valid purchase_date found for company {company}")

    min_date = first_purchase.strftime("%Y-%m-%d")
    max_date = last_purchase.strftime("%Y-%m-%d")

    # Parse the bounds once instead of re-parsing them for every generated day
    first_day = datetime.strptime(min_date, "%Y-%m-%d")
    last_day = datetime.strptime(max_date, "%Y-%m-%d")

    # +1 so that the last covered day is part of the coverage (a company whose
    # purchases all fall on one day is covered for exactly that day)
    nb_days = (last_day - first_day).days + 1
    datecover[company] = [first_day + timedelta(days=x) for x in range(nb_days)]

    print(f'Couverture Company {company} : {min_date} - {max_date}')
    return datecover
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
def compute_time_intersection(datecover):
    """
    Return the days covered by every company.

    Parameters
    ----------
    datecover : dict mapping a company to an iterable of datetime objects
        (as filled by display_covering_time).

    Returns
    -------
    list of str
        The days present for all companies, formatted 'YYYY-MM-DD' and sorted
        in ascending order. Empty when `datecover` is empty or when the
        companies share no day.
    """
    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]
    # set.intersection() needs at least one set: no company means no coverage
    if not timestamps_sets:
        return []
    intersection = set.intersection(*timestamps_sets)
    return sorted(dt.strftime("%Y-%m-%d") for dt in intersection)
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
def df_coverage_modelization(sport, coverage_train = 0.7):
    """
    Compute the dates used to build the train and test datasets.

    Parameters
    ----------
    sport : iterable of company identifiers whose purchases are loaded.
    coverage_train : float in [0, 1], share of the common coverage period
        that ends at `end_of_features` (default 0.7).

    Returns
    -------
    tuple of str
        (start_date, end_of_features, final_date), each formatted
        'YYYY-MM-DD': first common day, day located at `coverage_train` of
        the common period, last common day.

    Raises
    ------
    ValueError
        If `coverage_train` is outside [0, 1], if no company is given, or if
        the companies share no covered day.
    """
    if not 0 <= coverage_train <= 1:
        raise ValueError(f"coverage_train must be between 0 and 1, got {coverage_train}")

    datecover = {}
    for company in sport:
        df_products_purchased_reduced = display_databases(company, file_name = "products_purchased_reduced",
                                                          datetime_col = ['purchase_date'])
        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)

    if not datecover:
        raise ValueError("No company provided: cannot compute a time coverage")

    dt_coverage = compute_time_intersection(datecover)
    if not dt_coverage:
        raise ValueError("The selected companies have no common covered day")

    # Honour the requested share; clamp so that coverage_train = 1 maps to the last day
    split_index = min(int(coverage_train * len(dt_coverage)), len(dt_coverage) - 1)

    start_date = dt_coverage[0]
    end_of_features = dt_coverage[split_index]
    final_date = dt_coverage[-1]
    return start_date, end_of_features, final_date
|
|
||||||
" \n",
|
|
||||||
"\n",
|
|
def dataset_construction(min_date, end_features_date, max_date, directory_path):
    """
    Build the modelling dataset (features and purchase indicator) of one company.

    Features are computed from the campaigns sent and the purchases made in
    [min_date, end_features_date]. The explained variable `y_has_purchased`
    is 1 for customers with at least one purchase in
    (end_features_date, max_date] and 0 otherwise.

    Parameters
    ----------
    min_date, end_features_date, max_date : ISO 8601 date strings; they are
        converted to UTC timestamps before filtering.
    directory_path : str, company identifier passed to display_databases and
        used as prefix of the returned customer ids.

    Returns
    -------
    DataFrame
        One row per customer with the KPI columns, `y_has_purchased`, and
        `customer_id` rewritten as '<directory_path>_<customer_id>'.
    """
    # Import customerplus, campaigns and purchases
    df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
    df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
    df_products_purchased_all = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])

    # Consistency filter required by the method: work with UTC timestamps
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

    # Filter df_campaigns_information: keep campaigns sent during the feature period.
    # .copy() so that the assignment below modifies an independent frame.
    sent_at = df_campaigns_information['sent_at']
    df_campaigns_information = df_campaigns_information[(sent_at <= end_features_date) & (sent_at >= min_date)].copy()

    # An opening that happens after the feature period must not leak into the features
    opened_too_late = df_campaigns_information['opened_at'] >= end_features_date
    df_campaigns_information.loc[opened_too_late, 'opened_at'] = pd.NaT

    # Filter purchases: feature period only. The unfiltered frame is kept
    # because the explained variable is built from purchases made afterwards.
    purchase_date = df_products_purchased_all['purchase_date']
    df_products_purchased_reduced = df_products_purchased_all[(purchase_date <= end_features_date) & (purchase_date >= min_date)]

    print("Data filtering : SUCCESS")

    # 1. Merge everything and build the KPIs (helpers come from 0_KPI_functions.py)

    # KPIs on advertising campaigns
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)

    # KPIs on purchasing behaviour
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)

    # KPIs on socio-demographic data
    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

    print("KPIs construction : SUCCESS")

    # Merge with customer-related KPIs
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Customers without any campaign get 0 instead of NaN
    campaign_cols = ['nb_campaigns', 'nb_campaigns_opened']
    df_customer[campaign_cols] = df_customer[campaign_cols].fillna(0)

    # Merge with purchasing-behaviour KPIs
    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')

    # Customers without any purchase get 0 instead of NaN
    ticket_cols = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']
    df_customer_product[ticket_cols] = df_customer_product[ticket_cols].fillna(0)

    print("Explanatory variable construction : SUCCESS")

    # 2. Construction of the explained variable
    # The window (end_features_date, max_date] is read from the unfiltered
    # purchases: the feature-period frame contains no later purchase.
    in_target_window = (purchase_date <= max_date) & (purchase_date > end_features_date)

    # Purchase indicator: one row per customer who bought in the target window
    y = df_products_purchased_all.loc[in_target_window, ['customer_id']].drop_duplicates()
    y['y_has_purchased'] = 1

    print("Explained variable construction : SUCCESS")

    # 3. Merge between explained and explanatory variables
    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')

    # 0 if there is no purchase (fillna returns a new object: assign it back)
    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)

    # add id_company prefix to customer_id
    dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')

    return dataset
|
|
||||||
"\n",
|
|
## Exportation

companies = {'musee' : ['1', '2', '3', '4', '101'],
             'sport': ['5', '6', '7', '8', '9'],
             'musique' : ['10', '11', '12', '13', '14']}

# Normalise the answer so that stray spaces or capitals do not end in a KeyError
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip().lower()
if type_of_comp not in companies:
    raise ValueError(f"Unknown company type {type_of_comp!r}; expected one of {sorted(companies)}")
list_of_comp = companies[type_of_comp]

# Export folder
BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'

# Create test dataset and train dataset for the companies of the selected type
start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)

for company in list_of_comp:
    # Dataset test
    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,
                                        max_date = final_date, directory_path = company)

    # Exportation
    FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3

    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        dataset_test.to_csv(file_out, index = False)

    print("Exportation dataset test : SUCCESS")

    # Dataset train
    # NOTE(review): built with exactly the same dates as dataset_test, so both
    # exported files contain the same rows - confirm the intended train period.
    dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,
                                         max_date = final_date, directory_path = company)

    # Export
    # NOTE(review): the folder is named "Train_test" while the test folder is
    # "Test_set"; kept unchanged because other code may read from this path.
    FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_test/" + FILE_KEY_OUT_S3

    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        dataset_train.to_csv(file_out, index = False)

    print("Exportation dataset train : SUCCESS")


print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "3721427e-5957-4556-b278-2e7ffca892f4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"FILE_PATH_OUT_S3"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"id": "f8546992-f425-4d1e-ad75-ad26a8052a18",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'projet' is not defined",
|
|
||||||
"output_type": "error",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprojet\u001b[49m\u001b[38;5;241m-\u001b[39mbdc2324\u001b[38;5;241m-\u001b[39mteam1\u001b[38;5;241m/\u001b[39mGeneralization\u001b[38;5;241m/\u001b[39mmusique\u001b[38;5;241m/\u001b[39mTrain_test\n",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m: name 'projet' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"projet-bdc2324-team1/Generalization/musique/Train_test"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "0dd34710-6da2-4438-9e1d-0ac092c1d28c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"(343126, 41)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"dataset_train.shape"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"customer_id 0.000000\n",
|
|
||||||
"nb_tickets 0.000000\n",
|
|
||||||
"nb_purchases 0.000000\n",
|
|
||||||
"total_amount 0.000000\n",
|
|
||||||
"nb_suppliers 0.000000\n",
|
|
||||||
"vente_internet_max 0.000000\n",
|
|
||||||
"purchase_date_min 0.858950\n",
|
|
||||||
"purchase_date_max 0.858950\n",
|
|
||||||
"time_between_purchase 0.858950\n",
|
|
||||||
"nb_tickets_internet 0.000000\n",
|
|
||||||
"street_id 0.000000\n",
|
|
||||||
"structure_id 0.869838\n",
|
|
||||||
"mcp_contact_id 0.276677\n",
|
|
||||||
"fidelity 0.000000\n",
|
|
||||||
"tenant_id 0.000000\n",
|
|
||||||
"is_partner 0.000000\n",
|
|
||||||
"deleted_at 1.000000\n",
|
|
||||||
"gender 0.000000\n",
|
|
||||||
"is_email_true 0.000000\n",
|
|
||||||
"opt_in 0.000000\n",
|
|
||||||
"last_buying_date 0.709626\n",
|
|
||||||
"max_price 0.709626\n",
|
|
||||||
"ticket_sum 0.000000\n",
|
|
||||||
"average_price 0.709626\n",
|
|
||||||
"average_purchase_delay 0.709731\n",
|
|
||||||
"average_price_basket 0.709731\n",
|
|
||||||
"average_ticket_basket 0.709731\n",
|
|
||||||
"total_price 0.000000\n",
|
|
||||||
"purchase_count 0.000000\n",
|
|
||||||
"first_buying_date 0.709626\n",
|
|
||||||
"country 0.152090\n",
|
|
||||||
"gender_label 0.000000\n",
|
|
||||||
"gender_female 0.000000\n",
|
|
||||||
"gender_male 0.000000\n",
|
|
||||||
"gender_other 0.000000\n",
|
|
||||||
"country_fr 0.152090\n",
|
|
||||||
"has_tags 0.000000\n",
|
|
||||||
"nb_campaigns 0.000000\n",
|
|
||||||
"nb_campaigns_opened 0.000000\n",
|
|
||||||
"time_to_open 0.848079\n",
|
|
||||||
"y_has_purchased 1.000000\n",
|
|
||||||
"dtype: float64"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
" dataset_train.isna().sum()/dataset_train.shape[0]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "75f9a672-641f-49a2-a8d6-7673845506f5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
# Creation of the dummy dependent variable: 1 if the individual made a purchase
# during the train period and 0 otherwise.
# Work on an independent copy: a plain assignment would only alias
# dataset_train, so every later change would also alter the original frame.
dataset_train_modif = dataset_train.copy()
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c121c1e2-d8e4-4b93-a882-9385581b63c9",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"dataset_train_modif[\""
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.6"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user