Deletion of exploratory and draft notebooks

Antoine JOUBREL 2024-04-09 20:20:57 +00:00
parent 9ca22fb9e7
commit 4ed6bd809d
30 changed files with 0 additions and 88609 deletions

@@ -1,68 +0,0 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

exec(open('../0_KPI_functions.py').read())
exec(open('plot.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choose the type of company: sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (customers who bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]

for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))  # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)]  # keep only valid customers
    # print(f'shape of {dataset} : ', dataset.shape)

# Rebind the named datasets to their filtered versions
customer, campaigns_kpi, campaigns_brut, tickets, products = databases

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)

# Generate the graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer, type_of_activity)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
# campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)

@@ -1,328 +0,0 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

def load_files(nb_compagnie):
    customer = pd.DataFrame()
    campaigns_brut = pd.DataFrame()
    campaigns_kpi = pd.DataFrame()
    products = pd.DataFrame()
    tickets = pd.DataFrame()

    # Loop building the aggregated datasets for the companies of the chosen activity
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
        df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
        df_target_information = display_databases(directory_path, file_name = "target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

        # Create the number_company column, used to aggregate the results across companies
        df_tickets_kpi["number_company"] = int(directory_path)
        df_campaigns_brut["number_company"] = int(directory_path)
        df_campaigns_kpi["number_company"] = int(directory_path)
        df_customerplus_clean["number_company"] = int(directory_path)
        df_target_information["number_company"] = int(directory_path)

        # Index handling: prefix customer ids with the company number to keep them unique
        df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
        df_campaigns_brut["customer_id"] = directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
        df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
        df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
        df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

        # Concatenation
        customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
        campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
        campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
        tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
        products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)

    return customer, campaigns_kpi, campaigns_brut, tickets, products


def save_file_s3(File_name, type_of_activity):
    # Serialize the current matplotlib figure and upload it to the team bucket
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def outlier_detection(tickets, company_list, show_diagram=False):
    outlier_list = list()

    for company in company_list:
        total_amount_share = tickets[tickets['number_company'] == int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
        total_amount_share['CA'] = total_amount_share['total_amount'].sum()
        total_amount_share['share_total_amount'] = total_amount_share['total_amount'] / total_amount_share['CA']

        total_amount_share_index = total_amount_share.set_index('customer_id')
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
        # print('df circulaire : ', df_circulaire.head())

        # The single biggest spender of each company is treated as its anonymous customer
        top = df_circulaire[:1]
        # print('top : ', top)
        outlier_list.append(top.index[0])

        rest = df_circulaire[1:]
        rest_sum = rest.sum()
        new_series = pd.concat([top, pd.Series([rest_sum], index=['Other'])])

        if show_diagram:
            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Breakdown of total amounts for company {company}')
            plt.show()

    return outlier_list


def valid_customer_detection(products, campaigns_brut):
    products_valid = products[products['purchase_date'] >= "2021-05-01"]
    consumer_valid_product = products_valid['customer_id'].to_list()

    campaigns_valid = campaigns_brut[campaigns_brut["sent_at"] >= "2021-05-01"]
    consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()

    consumer_valid = consumer_valid_product + consumer_valid_campaigns
    return consumer_valid


def identify_purchase_during_target_periode(products):
    products_target_period = products[(products['purchase_date'] >= "2022-11-01")
                                      & (products['purchase_date'] <= "2023-11-01")]
    customer_target_period = products_target_period['customer_id'].to_list()
    return customer_target_period


def remove_elements(lst, elements_to_remove):
    # Blank out a customer_id that belongs to the outlier list, keep it unchanged otherwise
    return '' if lst in elements_to_remove else lst


def compute_nb_clients(customer, type_of_activity):
    company_nb_clients = customer[customer["purchase_count"] > 0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Mailing consent for {type_of_activity}')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer, type_of_activity):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()

    # Build the grouped barplot
    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw the grouped bars
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Add the labels, the legend, etc.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Mailing consent by target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)


def gender_bar(customer, type_of_activity):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Stacked barplot of the gender shares
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom = company_genders["gender_male"], label = "Female")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of customers for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French customers")
    plt.title(f"Share of French customers for {type_of_activity}")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    # Mean number of campaigns opened per customer, by company
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])

    plt.xlabel('Company')
    plt.ylabel("Mean number of campaigns opened")
    plt.title(f"Mean number of campaigns opened for {type_of_activity}")
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):
    # Opt-in share by company (draft: commented out at the call site)
    campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()
    plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])

    plt.xlabel('Company')
    plt.ylabel("Opt-in share")
    plt.title(f"Opt-in share of customers who bought or received mails for {type_of_activity}")
    plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)


def sale_dynamics(products, campaigns_brut, type_of_activity):
    # Month of the first purchase
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first mail received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')

    # Month from which the customer is considered known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month by customer type
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)

    # Plot of the number of orders
    purchases_graph = nb_purchases_graph
    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021, 3, 1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"] == False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"] == True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on = "purchase_date_month", suffixes = ("_new", "_old"))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width = 12, label = "New customer")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom = merged_data["nb_purchases_new"], width = 12, label = "Returning customer")

    # Show only month-year ticks on the x axis
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)


def tickets_internet(tickets, type_of_activity):
    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
    nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Tickets Bought Online")
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    price_tickets = tickets[tickets['total_amount'] > 0]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of ticket prices for {type_of_activity}")
    # seaborn already labels the categorical x axis with the company numbers
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)

@@ -1,436 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "135a67de-cff8-4345-bacc-d9f9fa68a41f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
"from sklearn.utils import class_weight\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"\n",
"import statsmodels.api as sm\n",
"\n",
"import pickle\n",
"import warnings"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9a6254df-d496-4957-89ea-9ed2b74049dd",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "922cf05f-8343-4ed0-ad62-3ef1f17c0730",
"metadata": {},
"outputs": [],
"source": [
"def load_train_test():\n",
" BUCKET = \"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/musee\"\n",
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
" \n",
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
"\n",
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
" \n",
" return dataset_train, dataset_test\n",
"\n",
"\n",
"def features_target_split(dataset_train, dataset_test):\n",
" features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',\n",
" 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n",
" 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n",
" 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',\n",
" 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',\n",
" 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',\n",
" 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',\n",
" 'target_jeune', 'target_abonne']\n",
" X_train = dataset_train[features_l]\n",
" y_train = dataset_train[['y_has_purchased']]\n",
"\n",
" X_test = dataset_test[features_l]\n",
" y_test = dataset_test[['y_has_purchased']]\n",
" return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2584e454-111b-4c39-881b-676841cb5aa1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_498/3950829189.py:7: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_498/3950829189.py:11: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"dataset_train, dataset_test = load_train_test()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a32ea7f8-e2d3-44db-8937-5afda9447b58",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3bdc8840-7f45-416f-8ee0-307db201c496",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"const 0\n",
"nb_campaigns 0\n",
"taux_ouverture_mail 0\n",
"prop_purchases_internet 0\n",
"nb_tickets 0\n",
"nb_purchases 0\n",
"total_amount 0\n",
"nb_suppliers 0\n",
"time_to_open 0\n",
"purchases_10_2021 0\n",
"purchases_10_2022 0\n",
"purchases_11_2021 0\n",
"purchases_12_2021 0\n",
"purchases_1_2022 0\n",
"purchases_2_2022 0\n",
"purchases_3_2022 0\n",
"purchases_4_2022 0\n",
"purchases_5_2021 0\n",
"purchases_5_2022 0\n",
"purchases_6_2021 0\n",
"purchases_6_2022 0\n",
"purchases_7_2021 0\n",
"purchases_7_2022 0\n",
"purchases_8_2021 0\n",
"purchases_8_2022 0\n",
"purchases_9_2021 0\n",
"purchases_9_2022 0\n",
"purchase_date_min 0\n",
"purchase_date_max 0\n",
"nb_targets 0\n",
"gender_female 0\n",
"gender_male 0\n",
"achat_internet 0\n",
"categorie_age_0_10 0\n",
"categorie_age_10_20 0\n",
"categorie_age_20_30 0\n",
"categorie_age_30_40 0\n",
"categorie_age_40_50 0\n",
"categorie_age_50_60 0\n",
"categorie_age_60_70 0\n",
"categorie_age_70_80 0\n",
"categorie_age_plus_80 0\n",
"categorie_age_inconnue 0\n",
"country_fr 0\n",
"is_profession_known 0\n",
"is_zipcode_known 0\n",
"opt_in 0\n",
"target_optin 0\n",
"target_newsletter 0\n",
"target_scolaire 0\n",
"target_entreprise 0\n",
"target_famille 0\n",
"target_jeune 0\n",
"target_abonne 0\n",
"dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3c3ac545-52e0-4d0c-afdc-fff70f468a94",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"most_frequent_value = X_train['country_fr'].mode()[0]\n",
"most_frequent_value"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "0fcdc5ee-bcea-4436-be9b-92b79d27a230",
"metadata": {},
"outputs": [],
"source": [
"X_train['country_fr'] = X_train['country_fr'].fillna(most_frequent_value)\n",
"X_train['time_to_open'] = X_train['time_to_open'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7ecdaf1a-b5e4-4880-871e-363eae6fe4e1",
"metadata": {},
"outputs": [],
"source": [
"weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n",
" y = y_train['y_has_purchased'])\n",
"\n",
"weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}"
]
},
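{
"cell_type": "code",
"execution_count": null,
"id": "b1c2d3e4-sketch-class-weight-usage",
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption, not in the original notebook): weight_dict is computed above but\n",
"# never used below. With the sklearn imports from the first cell it would typically\n",
"# feed a classifier, e.g.:\n",
"# clf = LogisticRegression(class_weight=weight_dict, max_iter=5000)\n",
"# clf.fit(X_train, y_train['y_has_purchased'])"
]
},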
{
"cell_type": "code",
"execution_count": 9,
"id": "a6b56090-cfe9-4772-810c-d36bf12aceca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.52239696, 0.52239696, 0.52239696, ..., 0.52239696, 0.52239696,\n",
" 0.52239696])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"class_counts = np.bincount(y_train['y_has_purchased'])\n",
"class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)\n",
"\n",
"weights = class_weights[y_train['y_has_purchased'].values.astype(int)]\n",
"weights"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "bfaea23e-7d7a-4c0d-96f6-4ab4c7c2ff51",
"metadata": {},
"outputs": [],
"source": [
"X_train = sm.add_constant(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "4cf97ae5-9dcf-4f4c-91b3-3b1f339a6213",
"metadata": {},
"outputs": [],
"source": [
"numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
" 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n",
" 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n",
" 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "debb36df-3c2f-4cf7-83a9-ad6e4f6b0470",
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"\n",
"X_train_scaled_columns = scaler.fit_transform(X_train[numeric_features])\n",
"\n",
"X_train_scaled = X_train.copy() #\n",
"X_train_scaled[numeric_features] = X_train_scaled_columns"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "7eaa6160-20a0-4a78-ac38-0411e19707ed",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/mamba/lib/python3.11/site-packages/statsmodels/base/optimizer.py:18: FutureWarning: Keyword arguments have been passed to the optimizer that have no effect. The list of allowed keyword arguments for method newton is: tol, ridge_factor. The list of unsupported keyword arguments passed include: weights. After release 0.14, this will raise.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.136180\n",
" Iterations 9\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y_has_purchased No. Observations: 434278\n",
"Model: Logit Df Residuals: 434226\n",
"Method: MLE Df Model: 51\n",
"Date: Thu, 04 Apr 2024 Pseudo R-squ.: 0.2305\n",
"Time: 06:09:09 Log-Likelihood: -59140.\n",
"converged: True LL-Null: -76855.\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"===========================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------------\n",
"const -4.0679 1.65e+06 -2.46e-06 1.000 -3.24e+06 3.24e+06\n",
"nb_campaigns 0.0916 0.012 7.352 0.000 0.067 0.116\n",
"taux_ouverture_mail 0.0012 0.011 0.106 0.916 -0.021 0.023\n",
"prop_purchases_internet -0.1995 0.067 -2.972 0.003 -0.331 -0.068\n",
"nb_tickets 0.5956 0.193 3.091 0.002 0.218 0.973\n",
"nb_purchases 0.1598 1.71e+06 9.37e-08 1.000 -3.34e+06 3.34e+06\n",
"total_amount -0.1938 0.071 -2.724 0.006 -0.333 -0.054\n",
"nb_suppliers 0.0282 0.021 1.348 0.178 -0.013 0.069\n",
"time_to_open 0.2785 0.018 15.534 0.000 0.243 0.314\n",
"purchases_10_2021 0.0417 4.76e+04 8.76e-07 1.000 -9.34e+04 9.34e+04\n",
"purchases_10_2022 0.4578 2.72e+05 1.68e-06 1.000 -5.33e+05 5.33e+05\n",
"purchases_11_2021 0.0252 4.92e+04 5.12e-07 1.000 -9.65e+04 9.65e+04\n",
"purchases_12_2021 0.0221 6.3e+04 3.5e-07 1.000 -1.24e+05 1.24e+05\n",
"purchases_1_2022 0.0083 5.49e+04 1.52e-07 1.000 -1.08e+05 1.08e+05\n",
"purchases_2_2022 0.0462 7.59e+04 6.09e-07 1.000 -1.49e+05 1.49e+05\n",
"purchases_3_2022 0.0928 1.07e+05 8.67e-07 1.000 -2.1e+05 2.1e+05\n",
"purchases_4_2022 0.1446 1.65e+05 8.75e-07 1.000 -3.24e+05 3.24e+05\n",
"purchases_5_2021 -0.0427 4.84e+04 -8.83e-07 1.000 -9.48e+04 9.48e+04\n",
"purchases_5_2022 0.1412 1.67e+05 8.46e-07 1.000 -3.27e+05 3.27e+05\n",
"purchases_6_2021 -0.0252 5.55e+04 -4.54e-07 1.000 -1.09e+05 1.09e+05\n",
"purchases_6_2022 0.1246 1.84e+05 6.77e-07 1.000 -3.6e+05 3.6e+05\n",
"purchases_7_2021 -0.0252 5.55e+04 -4.55e-07 1.000 -1.09e+05 1.09e+05\n",
"purchases_7_2022 -0.0074 2.1e+05 -3.54e-08 1.000 -4.12e+05 4.12e+05\n",
"purchases_8_2021 0.0116 5.26e+04 2.21e-07 1.000 -1.03e+05 1.03e+05\n",
"purchases_8_2022 0.0554 2.4e+05 2.31e-07 1.000 -4.7e+05 4.7e+05\n",
"purchases_9_2021 -0.0320 5.47e+04 -5.85e-07 1.000 -1.07e+05 1.07e+05\n",
"purchases_9_2022 0.2349 2.2e+05 1.07e-06 1.000 -4.32e+05 4.32e+05\n",
"purchase_date_min 0.0781 0.025 3.092 0.002 0.029 0.128\n",
"purchase_date_max -0.5228 0.026 -20.021 0.000 -0.574 -0.472\n",
"nb_targets 0.7083 0.010 74.555 0.000 0.690 0.727\n",
"gender_female 0.2961 0.038 7.701 0.000 0.221 0.371\n",
"gender_male 0.0450 0.040 1.137 0.256 -0.033 0.123\n",
"achat_internet 0.1869 0.158 1.186 0.236 -0.122 0.496\n",
"categorie_age_0_10 -0.2713 1.65e+06 -1.64e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_10_20 -0.1238 1.65e+06 -7.48e-08 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_20_30 -0.6322 1.65e+06 -3.82e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_30_40 -0.5004 1.65e+06 -3.02e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_40_50 -0.4020 1.65e+06 -2.43e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_50_60 -0.4101 1.65e+06 -2.48e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_60_70 -0.3232 1.65e+06 -1.95e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_70_80 -0.1635 1.65e+06 -9.88e-08 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_plus_80 -0.4677 1.65e+06 -2.83e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_inconnue -0.7737 1.65e+06 -4.68e-07 1.000 -3.24e+06 3.24e+06\n",
"country_fr 0.7419 0.065 11.422 0.000 0.615 0.869\n",
"is_profession_known -0.5947 0.066 -9.074 0.000 -0.723 -0.466\n",
"is_zipcode_known 1.1374 0.027 41.609 0.000 1.084 1.191\n",
"opt_in -1.0658 0.030 -35.485 0.000 -1.125 -1.007\n",
"target_optin 0.5946 0.034 17.361 0.000 0.527 0.662\n",
"target_newsletter -1.0237 0.035 -29.411 0.000 -1.092 -0.955\n",
"target_scolaire 0.0428 0.036 1.188 0.235 -0.028 0.113\n",
"target_entreprise -0.2645 0.058 -4.589 0.000 -0.377 -0.152\n",
"target_famille 0.5035 0.035 14.548 0.000 0.436 0.571\n",
"target_jeune -0.6795 0.029 -23.590 0.000 -0.736 -0.623\n",
"target_abonne 0.0677 0.037 1.833 0.067 -0.005 0.140\n",
"===========================================================================================\n"
]
}
],
"source": [
"model_logit = sm.Logit(y_train, X_train_scaled)\n",
"\n",
"result = model_logit.fit(weights=weights)\n",
"\n",
"print(result.summary())"
]
},
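{
"cell_type": "code",
"execution_count": null,
"id": "c2d3e4f5-sketch-weighted-glm",
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption, not in the original notebook): the FutureWarning above shows that\n",
"# sm.Logit.fit() ignores the `weights` keyword, so the summary is effectively unweighted.\n",
"# A weighted fit could go through a binomial GLM with frequency weights instead:\n",
"# result_w = sm.GLM(y_train, X_train_scaled, family=sm.families.Binomial(), freq_weights=weights).fit()\n",
"# print(result_w.summary())"
]
},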
{
"cell_type": "code",
"execution_count": null,
"id": "75dc92c7-cc1e-40f1-bc74-0b04043b7e44",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -1,825 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0b029d42-fb02-481e-a407-7e41886198a6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b",
"metadata": {},
"outputs": [],
"source": [
"# Chargement des fichiers campaign_stats.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e",
"metadata": {},
"outputs": [],
"source": [
"# Conversion des dates 'sent_at'\n",
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-09 18:10:45+00:00\n",
"2020-06-02 08:24:08+00:00\n",
"2023-10-12 01:39:48+00:00\n",
"2023-10-10 17:06:29+00:00\n",
"2023-11-01 09:20:48+00:00\n",
"2021-03-31 14:59:02+00:00\n"
]
}
],
"source": [
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
"print(campaign_stats_1['sent_at'].max())\n",
"print(campaign_stats_1['sent_at'].min())\n",
"\n",
"print(campaign_stats_2['sent_at'].max())\n",
"print(campaign_stats_2['sent_at'].min())\n",
"\n",
"print(campaign_stats_3['sent_at'].max())\n",
"print(campaign_stats_3['sent_at'].min())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c75632df-b018-4bb8-a99d-83f15af94369",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2021-03-28 16:01:09+00:00\n",
"1 2021-03-28 16:01:09+00:00\n",
"2 2021-03-28 16:00:59+00:00\n",
"3 2021-03-28 16:00:59+00:00\n",
"4 2021-03-28 16:01:06+00:00\n",
" ... \n",
"6214803 2023-10-23 09:32:33+00:00\n",
"6214804 2023-10-23 09:32:49+00:00\n",
"6214805 2023-10-23 09:33:28+00:00\n",
"6214806 2023-10-23 09:31:53+00:00\n",
"6214807 2023-10-23 09:33:54+00:00\n",
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d3bf880d-1065-4d5b-9954-1830aa5081af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7368f381-db8e-4a4d-9fe2-5947eb55be58",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers_plus_1.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08091935-b159-47fa-806c-e1444f3b227e",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f8c8868-c1ac-4cee-af08-533d928f6764",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf95daf2-4852-4718-b474-207a1ebd8ac4",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1425c385-3216-4e4f-ae8f-a121624721ba",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "92533026-e27c-4f1f-81ca-64eda32a34c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"# Exemple id commun = caractéristiques communes\n",
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.000000\n",
"lastname 43.461341\n",
"firstname 44.995588\n",
"birthdate 96.419870\n",
"email 8.622075\n",
"street_id 0.000000\n",
"created_at 0.000000\n",
"updated_at 0.000000\n",
"civility 100.000000\n",
"is_partner 0.000000\n",
"extra 100.000000\n",
"deleted_at 100.000000\n",
"reference 100.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"extra_field 100.000000\n",
"identifier 0.000000\n",
"opt_in 0.000000\n",
"structure_id 88.072380\n",
"note 99.403421\n",
"profession 95.913503\n",
"language 99.280945\n",
"mcp_contact_id 34.876141\n",
"need_reload 0.000000\n",
"last_buying_date 51.653431\n",
"max_price 51.653431\n",
"ticket_sum 0.000000\n",
"average_price 8.639195\n",
"fidelity 0.000000\n",
"average_purchase_delay 51.653431\n",
"average_price_basket 51.653431\n",
"average_ticket_basket 51.653431\n",
"total_price 43.014236\n",
"preferred_category 100.000000\n",
"preferred_supplier 100.000000\n",
"preferred_formula 100.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 51.653431\n",
"last_visiting_date 100.000000\n",
"zipcode 71.176564\n",
"country 5.459418\n",
"age 96.419870\n",
"tenant_id 0.000000\n",
"dtype: float64\n"
]
}
],
"source": [
"pd.DataFrame(customers_plus_1.isna().mean()*100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6d62e73f-3925-490f-9fd4-d0e838903cb2",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>tenant_id</th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>start_date_time</th>\n",
" <th>event_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4380</td>\n",
" <td>lastname4380</td>\n",
" <td>firstname4380</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2021-04-22 14:51:55.432952+02:00</td>\n",
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1189141</td>\n",
" <td>4380</td>\n",
" <td>2020-11-26 13:12:53+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>51.3</td>\n",
" <td>False</td>\n",
" <td>2020-12-01 20:00:00+01:00</td>\n",
" <td>iphigenie en tauride</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 52 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 405082 lastname405082 NaN NaN NaN \n",
"1 405082 lastname405082 NaN NaN NaN \n",
"2 411168 lastname411168 NaN NaN NaN \n",
"3 411168 lastname411168 NaN NaN NaN \n",
"4 4380 lastname4380 firstname4380 NaN NaN \n",
"... ... ... ... ... ... \n",
"318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"\n",
" street_id created_at \\\n",
"0 6 2023-01-12 06:30:31.197484+01:00 \n",
"1 6 2023-01-12 06:30:31.197484+01:00 \n",
"2 6 2023-03-17 06:30:35.431967+01:00 \n",
"3 6 2023-03-17 06:30:35.431967+01:00 \n",
"4 1 2021-04-22 14:51:55.432952+02:00 \n",
"... ... ... \n",
"318964 6 2021-04-22 15:06:30.120537+02:00 \n",
"318965 6 2021-04-22 15:06:30.120537+02:00 \n",
"318966 6 2021-04-22 15:06:30.120537+02:00 \n",
"318967 6 2021-04-22 15:06:30.120537+02:00 \n",
"318968 6 2021-04-22 15:06:30.120537+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
"... ... ... ... ... \n",
"318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"\n",
" tenant_id id_x customer_id purchase_date type_of \\\n",
"0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
"... ... ... ... ... ... \n",
"318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"\n",
" is_from_subscription amount is_full_price start_date_time \\\n",
"0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
"... ... ... ... ... \n",
"318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"\n",
" event_name \n",
"0 zaide \n",
"1 zaide \n",
"2 luisa miller \n",
"3 luisa miller \n",
"4 iphigenie en tauride \n",
"... ... \n",
"318964 entre femmes \n",
"318965 entre femmes \n",
"318966 entre femmes \n",
"318967 a boire et a manger \n",
"318968 a boire et a manger \n",
"\n",
"[318969 rows x 52 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Jointure\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -1,460 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bf34b03c-536f-4f93-93a5-e452552653aa",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"FIN DE LA GENERATION DES DATASETS : SUCCESS\n"
]
}
],
"source": [
"# Business Data Challenge - Team 1\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings\n",
"from datetime import date, timedelta, datetime\n",
"\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"\n",
"# Import KPI construction functions\n",
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"\n",
"def display_covering_time(df, company, datecover):\n",
" \"\"\"\n",
" This function draws the time coverage of each company\n",
" \"\"\"\n",
" min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
" max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
" datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
" print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
" return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
" \"\"\"\n",
" This function returns the time coverage for all companies\n",
" \"\"\"\n",
" timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
" intersection = set.intersection(*timestamps_sets)\n",
" intersection_list = list(intersection)\n",
" formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
" return sorted(formated_dates)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_train = 0.7):\n",
" \"\"\"\n",
" This function returns start_date, end_of_features and final dates\n",
" that help to construct train and test datasets\n",
" \"\"\"\n",
" datecover = {}\n",
" for company in sport:\n",
" df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
" datetime_col = ['purchase_date'])\n",
" datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
" #print(datecover.keys())\n",
" dt_coverage = compute_time_intersection(datecover)\n",
" start_date = dt_coverage[0]\n",
" end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n",
" final_date = dt_coverage[-1]\n",
" return start_date, end_of_features, final_date\n",
" \n",
"\n",
"def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
" \n",
" # Import customerplus\n",
" df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
" df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
" df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
" \n",
" # Filtre de cohérence pour la mise en pratique de notre méthode\n",
" max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601') \n",
" end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
" min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
"\n",
" #Filtre de la base df_campaigns_information\n",
" df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n",
" df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
" \n",
" #Filtre de la base df_products_purchased_reduced\n",
" df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n",
"\n",
" print(\"Data filtering : SUCCESS\")\n",
" \n",
" # Fusion de l'ensemble et creation des KPI\n",
"\n",
" # KPI sur les campagnes publicitaires\n",
" df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) \n",
"\n",
" # KPI sur le comportement d'achat\n",
" df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n",
"\n",
" # KPI sur les données socio-démographiques\n",
" df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
" \n",
" print(\"KPIs construction : SUCCESS\")\n",
" \n",
" # Fusion avec KPI liés au customer\n",
" df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
" \n",
" # Fill NaN values\n",
" df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n",
" \n",
" # Fusion avec KPI liés au comportement d'achat\n",
" df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
" \n",
" # Fill NaN values\n",
" df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n",
"\n",
" print(\"Explanatory variable construction : SUCCESS\")\n",
"\n",
" # 2. Construction of the explained variable \n",
" df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n",
"\n",
" # Indicatrice d'achat\n",
" df_products_purchased_to_predict['y_has_purchased'] = 1\n",
"\n",
" y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()\n",
"\n",
" print(\"Explained variable construction : SUCCESS\")\n",
" \n",
" # 3. Merge between explained and explanatory variables\n",
" dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
"\n",
" # 0 if there is no purchase\n",
" dataset[['y_has_purchased']].fillna(0)\n",
"\n",
" # add id_company prefix to customer_id\n",
" dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')\n",
" \n",
" return dataset\n",
"\n",
"## Exportation\n",
"\n",
"companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
" 'sport': ['5', '6', '7', '8', '9'],\n",
" 'musique' : ['10', '11', '12', '13', '14']}\n",
"\n",
"type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n",
"list_of_comp = companies[type_of_comp] \n",
"# Dossier d'exportation\n",
"BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'\n",
"\n",
"# Create test dataset and train dataset for sport companies\n",
"\n",
"start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
"\n",
"for company in list_of_comp:\n",
" dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
" max_date = final_date, directory_path = company) \n",
"\n",
" # Exportation\n",
" FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n",
" FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Test_set/\" + FILE_KEY_OUT_S3\n",
" \n",
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" dataset_test.to_csv(file_out, index = False)\n",
" \n",
" print(\"Exportation dataset test : SUCCESS\")\n",
"\n",
"# Dataset train\n",
" dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
" max_date = final_date, directory_path = company)\n",
" # Export\n",
" FILE_KEY_OUT_S3 = \"dataset_train\" + company + \".csv\" \n",
" FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Train_test/\" + FILE_KEY_OUT_S3\n",
" \n",
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" dataset_train.to_csv(file_out, index = False)\n",
" \n",
" print(\"Exportation dataset train : SUCCESS\")\n",
"\n",
"\n",
"print(\"FIN DE LA GENERATION DES DATASETS : SUCCESS\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3721427e-5957-4556-b278-2e7ffca892f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"FILE_PATH_OUT_S3"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f8546992-f425-4d1e-ad75-ad26a8052a18",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'projet' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprojet\u001b[49m\u001b[38;5;241m-\u001b[39mbdc2324\u001b[38;5;241m-\u001b[39mteam1\u001b[38;5;241m/\u001b[39mGeneralization\u001b[38;5;241m/\u001b[39mmusique\u001b[38;5;241m/\u001b[39mTrain_test\n",
"\u001b[0;31mNameError\u001b[0m: name 'projet' is not defined"
]
}
],
"source": [
"projet-bdc2324-team1/Generalization/musique/Train_test"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0dd34710-6da2-4438-9e1d-0ac092c1d28c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(343126, 41)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id 0.000000\n",
"nb_tickets 0.000000\n",
"nb_purchases 0.000000\n",
"total_amount 0.000000\n",
"nb_suppliers 0.000000\n",
"vente_internet_max 0.000000\n",
"purchase_date_min 0.858950\n",
"purchase_date_max 0.858950\n",
"time_between_purchase 0.858950\n",
"nb_tickets_internet 0.000000\n",
"street_id 0.000000\n",
"structure_id 0.869838\n",
"mcp_contact_id 0.276677\n",
"fidelity 0.000000\n",
"tenant_id 0.000000\n",
"is_partner 0.000000\n",
"deleted_at 1.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"opt_in 0.000000\n",
"last_buying_date 0.709626\n",
"max_price 0.709626\n",
"ticket_sum 0.000000\n",
"average_price 0.709626\n",
"average_purchase_delay 0.709731\n",
"average_price_basket 0.709731\n",
"average_ticket_basket 0.709731\n",
"total_price 0.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 0.709626\n",
"country 0.152090\n",
"gender_label 0.000000\n",
"gender_female 0.000000\n",
"gender_male 0.000000\n",
"gender_other 0.000000\n",
"country_fr 0.152090\n",
"has_tags 0.000000\n",
"nb_campaigns 0.000000\n",
"nb_campaigns_opened 0.000000\n",
"time_to_open 0.848079\n",
"y_has_purchased 1.000000\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" dataset_train.isna().sum()/dataset_train.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "75f9a672-641f-49a2-a8d6-7673845506f5",
"metadata": {},
"outputs": [],
"source": [
"#Creation de la variable dependante fictive: 1 si l'individu a effectué un achat au cours de la periode de train et 0 sinon\n",
"\n",
"dataset_train_modif=dataset_train\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c121c1e2-d8e4-4b93-a882-9385581b63c9",
"metadata": {},
"outputs": [],
"source": [
"dataset_train_modif[\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large