Merge branch 'data_construction'

This commit is contained in:
Antoine JOUBREL 2024-02-20 22:46:14 +00:00
commit 29eafcc6b2
8 changed files with 348 additions and 1030 deletions

58
0_1_Input_cleaning.py Normal file
View File

@ -0,0 +1,58 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_Cleaning_and_merge_functions.py').read())
# Output folder
BUCKET_OUT = "projet-bdc2324-team1"
# Ignore warnings
warnings.filterwarnings('ignore')
def export_dataset(df, output_name):
print('Exporting dataset:', output_name)
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df.to_csv(file_out, index = False)
## 1 - Cleaning of the datasets
for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
# Cleaning target area
df1_target_information = preprocessing_target_area(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
# Cleaning campaign area
df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
## Exportation
# export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")
# Cleaning product area
df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
#Exportation
# export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")
print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")

128
0_2_Dataset_construction.py Normal file
View File

@ -0,0 +1,128 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_KPI_functions.py').read())
# Ignore warnings
warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import customerplus
df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
# Consistency filter for applying our method in practice
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter the df_campaigns_information table
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = np.datetime64('NaT')
# Filter the df_products_purchased_reduced table
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Merge everything and build the KPIs
# KPIs on email campaigns
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
# KPIs on purchasing behaviour
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
# KPIs on socio-demographic data
## Gender
df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
## Indicator for whether the customer lives in France
df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
print("KPIs construction : SUCCESS")
# Merge with customer-related KPIs
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Merge with purchasing-behaviour KPIs
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)].copy()
# Purchase indicator
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
# 3. Merge between explained and explanatory variables
dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
# 0 if there is no purchase
dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)
return dataset
## Export
# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
# Dataset test
dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")
# # Exportation
# FILE_KEY_OUT_S3 = "dataset_test.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_test.to_csv(file_out, index = False)
# print("Exportation dataset test : SUCCESS")
# Dataset train
dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")
# Exportation
# FILE_KEY_OUT_S3 = "dataset_train.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_train.to_csv(file_out, index = False)
# print("Exportation dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")

View File

@ -1,193 +0,0 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Import cleaning and merge functions
exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
exec(open('BDC-team-1/0_KPI_functions.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Ignore warning
warnings.filterwarnings('ignore')
# Data loading
BUCKET = "bdc2324-data/1"
liste_database = fs.ls(BUCKET)
# Loop to create a dataframe for each file in the list
client_number = liste_database[0].split("/")[1]
df_prefix = "df" + str(client_number) + "_"
for i in range(len(liste_database)) :
current_path = liste_database[i]
with fs.open(current_path, mode="rb") as file_in:
df = pd.read_csv(file_in)
# the pattern of the name is df1xxx
nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
globals()[nom_dataframe] = df
## 1 - Cleaning of the datasets
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
# Cleaning target area
df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)
# Cleaning campaign area
df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)
# Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Campaigns dataset clean.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df1_campaigns_information.to_csv(file_out, index = False)
## Cleaning product area
# Cleaning ticket area
df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)
BUCKET = "bdc2324-data"
directory_path = '1'
products_theme = create_products_table()
events_theme= create_events_table()
representation_theme = create_representations_table()
products_global = uniform_product_df()
# Product-related merge
df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
# Select the variables of interest
df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
#Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchases.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df1_products_purchased_reduced.to_csv(file_out, index = False)
## 2 - Construction of KPIs on a given period
def explanatory_variables(min_date, max_date, df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):
# Consistency filter for applying our method in practice
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter the df_campaigns_information table
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= max_date] = np.datetime64('NaT')
# Filter the df_products_purchased_reduced table
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Merge everything and build the KPIs
# KPIs on email campaigns
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
# KPIs on purchasing behaviour
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
# KPIs on socio-demographic data
## Gender
df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
## Indicator for whether the individual lives in France
df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
print("KPIs construction : SUCCESS")
# Merge with customer-related KPIs
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Merge with purchasing-behaviour KPIs
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
print("Explanatory variable construction : SUCCESS")
return df_customer_product
# Function to build the explained variable
def explained_variable(min_date, max_date, df_products_purchased_reduced = df1_products_purchased_reduced):
# Filter the purchase table
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > min_date)]
# Purchase indicator
df_products_purchased_reduced['y_has_purchased'] = 1
y = df_products_purchased_reduced[['customer_id', 'event_type_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
return y
## Export
# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
# Dataset test
X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced)
dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 'left')
# Exportation
FILE_KEY_OUT_S3 = "dataset_test.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
# Dataset train
X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced)
dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left')
# Exportation
FILE_KEY_OUT_S3 = "dataset_train.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
print("Exportation dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")

View File

@ -1,38 +1,92 @@
# Cleaning and merge functions
#### Cleaning and merge functions ####
# Cleaning function
BUCKET = "bdc2324-data"
# 1. Basic cleaning functions
def cleaning_date(df, column_name):
"""
Cleans the specified DataFrame column by converting its values to datetime using the ISO 8601 format.
Parameters:
- df: DataFrame
The DataFrame containing the column to clean.
- column_name: str
The name of the column to clean.
Returns:
- DataFrame
The modified DataFrame with the cleaned column.
Datetime column cleaning with ISO 8601 format
"""
df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
return df
def preprocessing_customerplus(customerplus = None):
def display_databases(directory_path, file_name):
"""
This function returns the file from s3 storage
"""
file_path = BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
customerplus_copy = customerplus.copy()
print("Shape : ", df.shape)
return df
def remove_horodates(df):
"""
This function removes timestamp ('horodate') columns such as created_at and updated_at
"""
df = df.drop(columns = ["created_at", "updated_at"])
return df
def order_columns_id(df):
"""
This function moves all id columns to the front so the dataset is easier to read
"""
substring = 'id'
id_columns = [col for col in df.columns if substring in col]
remaining_col = [col for col in df.columns if substring not in col]
new_order = id_columns + remaining_col
return df[new_order]
def process_df_2(df):
"""
This function organizes dataframe
"""
df = remove_horodates(df)
print("Number of columns : ", len(df.columns))
df = order_columns_id(df)
print("Columns : ", df.columns)
return df
def load_dataset(directory_path, name):
"""
This function loads csv file
"""
df = display_databases(directory_path, file_name = name)
df = process_df_2(df)
# drop na :
#df = df.dropna(axis=1, thresh=len(df))
# if identifier in table : delete it
if 'identifier' in df.columns:
df = df.drop(columns = 'identifier')
return df
# 2. Creation of cleaned and merged datasets
def preprocessing_customerplus(directory_path):
customerplus_copy = load_dataset(directory_path, name = "customersplus")
# Convert to date format
cleaning_date(customerplus_copy, 'first_buying_date')
cleaning_date(customerplus_copy, 'last_visiting_date')
# Select variables
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy
def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
def preprocessing_tickets_area(directory_path):
# Datasets loading
tickets = load_dataset(directory_path, name = "tickets")
purchases = load_dataset(directory_path, name = "purchases")
suppliers = load_dataset(directory_path, name = "suppliers")
type_ofs = load_dataset(directory_path, name = "type_ofs")
# Tickets table
tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
@ -48,7 +102,7 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
# Purchases table
# Clean the purchase date
cleaning_date(purchases, 'purchase_date')
# cleaning_date(purchases, 'purchase_date')
# Select variables
purchases = purchases[['id', 'purchase_date', 'customer_id']]
@ -67,8 +121,13 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
return ticket_information
def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
# Target.csv cleaning
def preprocessing_target_area(directory_path):
# Datasets loading
targets = load_dataset(directory_path, name = "targets")
target_types = load_dataset(directory_path, name = "target_types")
customer_target_mappings = load_dataset(directory_path, name = "customer_target_mappings")
# target cleaning
targets = targets[["id", "target_type_id", "name"]]
targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)
@ -88,16 +147,21 @@ def preprocessing_target_area(targets = None, target_types = None, customer_targ
return targets_full
def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
def preprocessing_campaigns_area(directory_path):
# Datasets loading
campaign_stats = load_dataset(directory_path, name = "campaign_stats")
campaigns = load_dataset(directory_path, name = "campaigns")
# campaign_stats cleaning
campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
cleaning_date(campaign_stats, 'opened_at')
cleaning_date(campaign_stats, 'sent_at')
cleaning_date(campaign_stats, 'delivered_at')
# cleaning_date(campaign_stats, 'opened_at')
# cleaning_date(campaign_stats, 'sent_at')
# cleaning_date(campaign_stats, 'delivered_at')
# campaigns cleaning
campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
cleaning_date(campaigns, 'campaign_sent_at')
# cleaning_date(campaigns, 'campaign_sent_at')
# Merge
campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")
@ -105,66 +169,11 @@ def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
return campaigns_full
def display_databases(file_name):
"""
This function returns the file from s3 storage
"""
file_path = BUCKET + "/" + directory_path + "/" + file_name
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
print("Shape : ", df.shape)
return df
def remove_horodates(df):
"""
This function removes timestamp ('horodate') columns such as created_at and updated_at
"""
df = df.drop(columns = ["created_at", "updated_at"])
return df
def order_columns_id(df):
"""
This function moves all id columns to the front so the dataset is easier to read
"""
substring = 'id'
id_columns = [col for col in df.columns if substring in col]
remaining_col = [col for col in df.columns if substring not in col]
new_order = id_columns + remaining_col
return df[new_order]
def process_df_2(df):
"""
This function organizes dataframe
"""
df = remove_horodates(df)
print("Number of columns : ", len(df.columns))
df = order_columns_id(df)
print("Columns : ", df.columns)
return df
def load_dataset(name):
"""
This function loads csv file
"""
df = display_databases(name)
df = process_df_2(df)
# drop na :
#df = df.dropna(axis=1, thresh=len(df))
# if identifier in table : delete it
if 'identifier' in df.columns:
df = df.drop(columns = 'identifier')
return df
def create_products_table():
def create_products_table(directory_path):
# first merge products and categories
print("first merge products and categories")
products = load_dataset("1products.csv")
categories = load_dataset("1categories.csv")
products = load_dataset(directory_path, name = "products")
categories = load_dataset(directory_path, name = "categories")
# Drop useless columns
products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
categories = categories.drop(columns = ['extra_field', 'quota'])
@ -176,7 +185,7 @@ def create_products_table():
# Second merge products_theme and type of categories
print("Second merge products_theme and type of categories")
type_of_categories = load_dataset("1type_of_categories.csv")
type_of_categories = load_dataset(directory_path, name = "type_of_categories")
type_of_categories = type_of_categories.drop(columns = 'id')
products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
right_on = 'category_id' )
@ -187,11 +196,11 @@ def create_products_table():
return products_theme
def create_events_table():
def create_events_table(directory_path):
# first merge events and seasons :
print("first merge events and seasons : ")
events = load_dataset("1events.csv")
seasons = load_dataset("1seasons.csv")
events = load_dataset(directory_path, name = "events")
seasons = load_dataset(directory_path, name = "seasons")
# Drop useless columns
events = events.drop(columns = ['manual_added', 'is_display'])
@ -201,7 +210,7 @@ def create_events_table():
# Secondly merge events_theme and event_types
print("Secondly merge events_theme and event_types : ")
event_types = load_dataset("1event_types.csv")
event_types = load_dataset(directory_path, name = "event_types")
event_types = event_types.drop(columns = ['fidelity_delay'])
events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))
@ -210,7 +219,7 @@ def create_events_table():
# thirdly merge events_theme and facilities
print("thirdly merge events_theme and facilities : ")
facilities = load_dataset("1facilities.csv")
facilities = load_dataset(directory_path, name = "facilities")
facilities = facilities.drop(columns = ['fixed_capacity'])
events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))
@ -222,14 +231,13 @@ def create_events_table():
events_theme = order_columns_id(events_theme)
return events_theme
def create_representations_table():
representations = load_dataset("1representations.csv")
def create_representations_table(directory_path):
representations = load_dataset(directory_path, name = "representations")
representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
'representation_type_id'])
representations_capacity = load_dataset("1representation_category_capacities.csv")
representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
representations_theme = representations.merge(representations_capacity, how='left',
@ -240,22 +248,27 @@ def create_representations_table():
representations_theme = order_columns_id(representations_theme)
return representations_theme
def uniform_product_df():
def uniform_product_df(directory_path):
"""
This function returns the uniform product dataset
"""
products_theme = create_products_table(directory_path)
representation_theme = create_representations_table(directory_path)
events_theme = create_events_table(directory_path)
ticket_information = preprocessing_tickets_area(directory_path)
print("Products theme columns : ", products_theme.columns)
print("\n Representation theme columns : ", representation_theme.columns)
print("\n Events theme columns : ", events_theme.columns)
products_global = products_theme.merge(representation_theme, how='left',
products_global = pd.merge(products_theme, representation_theme, how='left',
on= ["representation_id", "category_id"])
products_global = products_global.merge(events_theme, how='left', on='event_id',
products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
suffixes = ("_representation", "_event"))
products_global = order_columns_id(products_global)
products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
# remove useless columns
products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'
return products_global
products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
return products_purchased_reduced
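A side note on the path convention used by display_databases in this file: the company folder number is repeated as a file-name prefix, so directory_path = "1" with file_name = "customersplus" resolves to bdc2324-data/1/1customersplus.csv. A small standalone sketch of that construction (the helper name build_raw_file_path and the example values are illustrative, not part of the committed code):
BUCKET = "bdc2324-data"

def build_raw_file_path(directory_path, file_name):
    # Same convention as display_databases: <bucket>/<company>/<company><file_name>.csv
    return BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"

print(build_raw_file_path("1", "customersplus"))  # bdc2324-data/1/1customersplus.csv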

View File

@ -1,6 +1,20 @@
# KPI construction functions
def custom_date_parser(date_string):
return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
def display_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None):
# Number of email campaigns
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
@ -35,17 +49,17 @@ def tickets_kpi_function(tickets_information = None):
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)
# Share of online sales
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
# Average amount
avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
.agg({"amount" : "mean"}).reset_index()
.rename(columns = {'amount' : 'avg_amount'}))
# avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
# .agg({"amount" : "mean"}).reset_index()
# .rename(columns = {'amount' : 'avg_amount'}))
tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id', 'event_type_id'])
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg({'ticket_id': 'count',
'purchase_id' : 'nunique',
'amount' : 'sum',
@ -61,8 +75,7 @@ def tickets_kpi_function(tickets_information = None):
'purchase_id_nunique' : 'nb_purchases',
'amount_sum' : 'total_amount',
'supplier_name_nunique' : 'nb_suppliers',
'customer_id_' : 'customer_id',
'event_type_id_' : 'event_type_id'}, inplace = True)
'customer_id_' : 'customer_id'}, inplace = True)
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # In days
@ -73,10 +86,10 @@ def tickets_kpi_function(tickets_information = None):
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
# tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
return tickets_kpi
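For orientation, a small self-contained sketch of the per-customer aggregation pattern that tickets_kpi_function now follows after event_type_id was dropped from the group keys (toy data and a simplified agg spec; the real function also derives supplier, purchase-date and online-sale KPIs):
import pandas as pd

tickets_toy = pd.DataFrame({
    'customer_id': [1, 1, 2],
    'ticket_id':   [10, 11, 12],
    'purchase_id': [100, 100, 200],
    'amount':      [25.0, 25.0, 40.0],
})
tickets_kpi_toy = (tickets_toy
    .groupby('customer_id')
    .agg(nb_tickets = ('ticket_id', 'count'),
         nb_purchases = ('purchase_id', 'nunique'),
         total_amount = ('amount', 'sum'))
    .reset_index())
print(tickets_kpi_toy)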

View File

@ -615,19 +615,15 @@
"FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" purchases = pd.read_csv(file_in, sep=\",\")\n",
" purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'])\n",
" \n",
"purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True, format = 'ISO8601')\n",
"\n",
"# Emails\n",
"BUCKET = \"projet-bdc2324-team1\"\n",
"FILE_KEY_S3 = \"0_Temp/Company 1 - Campaigns dataset clean.csv\"\n",
"FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaigns = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"campaigns['sent_at'] = pd.to_datetime(campaigns['sent_at'], utc = True, format = 'ISO8601')\n"
" campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'])\n"
]
},
{
@ -818,7 +814,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 16,
"id": "f663d68b-8a5c-4804-b31a-4477a03ca1e4",
"metadata": {
"scrolled": true
@ -906,7 +902,7 @@
"max 641981.000000 1.256574e+06"
]
},
"execution_count": 33,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@ -917,7 +913,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "d1212b10-3933-450a-b001-9e2cbf308f79",
"metadata": {},
"outputs": [
@ -1219,7 +1215,7 @@
"[1826672 rows x 15 columns]"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@ -1238,7 +1234,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"id": "dc45c1cd-2a78-48a6-aa2b-6a501254b6f2",
"metadata": {},
"outputs": [
@ -1458,7 +1454,7 @@
"[5 rows x 40 columns]"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -1478,7 +1474,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"id": "89fcb455-efb4-4ad4-ab88-efd6c8a76287",
"metadata": {},
"outputs": [
@ -1499,7 +1495,7 @@
" dtype='object')"
]
},
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@ -1510,7 +1506,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"id": "d7b2356a-d5fc-4547-b3ff-fded0e304fb6",
"metadata": {},
"outputs": [
@ -1634,7 +1630,7 @@
"9 0.0 "
]
},
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@ -1653,7 +1649,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"id": "5559748f-1745-4651-a9f6-94702c7ee66f",
"metadata": {},
"outputs": [
@ -1813,7 +1809,7 @@
"max 434.000000 "
]
},
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -1835,7 +1831,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"id": "4971e35d-a762-4e18-9443-fd9571bd3f1e",
"metadata": {},
"outputs": [
@ -1864,7 +1860,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"id": "bc65a711-d172-4839-b487-3047280fc3a6",
"metadata": {},
"outputs": [
@ -1894,7 +1890,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"id": "c95cc35c-abfc-47c7-9b8a-ac69bfd60dd8",
"metadata": {},
"outputs": [
@ -1922,7 +1918,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 25,
"id": "49d5fd2d-9bc1-43ac-9270-1efd73759854",
"metadata": {},
"outputs": [
@ -1967,7 +1963,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 26,
"id": "e50e2583-4b8f-478e-87ac-591dde200af8",
"metadata": {},
"outputs": [
@ -1988,7 +1984,7 @@
" dtype='object')"
]
},
"execution_count": 25,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@ -1999,7 +1995,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 27,
"id": "c724a315-9fe8-4874-be8f-a8115b17b5e2",
"metadata": {},
"outputs": [],
@ -2021,7 +2017,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 28,
"id": "58af5dcb-673e-4f4d-ad5c-f66ce1e8a22c",
"metadata": {},
"outputs": [
@ -2042,7 +2038,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 29,
"id": "cc3437f7-8b36-4398-9da6-ff15e8e4c8d7",
"metadata": {},
"outputs": [

View File

@ -1,695 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
"metadata": {},
"outputs": [],
"source": [
"# Chargement des fichiers campaign_stats.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
"metadata": {},
"outputs": [],
"source": [
"# Conversion des dates 'sent_at'\n",
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
"metadata": {},
"outputs": [],
"source": [
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
"print(campaign_stats_1['sent_at'].max())\n",
"print(campaign_stats_1['sent_at'].min())\n",
"\n",
"print(campaign_stats_2['sent_at'].max())\n",
"print(campaign_stats_2['sent_at'].min())\n",
"\n",
"print(campaign_stats_3['sent_at'].max())\n",
"print(campaign_stats_3['sent_at'].min())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
"metadata": {},
"outputs": [],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"# Exemple id commun = caractéristiques communes\n",
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"customers_plus_1.isna().mean()*100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f6ce60d-0912-497d-9108-330acccef394",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Jointure\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
},
{
"cell_type": "markdown",
"id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
"metadata": {},
"source": [
"# Fusion et exploration"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
"metadata": {},
"outputs": [],
"source": [
"# Jointure\n",
"var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
"\n",
"var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
"\n",
"var_choosed.remove('representation_id')\n",
"var_choosed.extend(['start_date_time', 'event_id'])\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
"\n",
"var_choosed.remove('event_id')\n",
"var_choosed.extend(['name', 'customer_id'])\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
"\n",
"# Changement de nom\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"var_choosed[var_choosed.index('name')] = \"event_name\"\n",
"\n",
"# Base finale\n",
"var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
"df_customer_event"
]
},
{
"cell_type": "markdown",
"id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
"metadata": {},
"source": [
"## Type de client au globale"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
"metadata": {},
"outputs": [],
"source": [
"# Client\n",
"print(customer_target_mappings.columns)\n",
"print(customer_target_mappings.shape)\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
"metadata": {},
"outputs": [],
"source": [
"customer_target_mappings['extra_field'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
"metadata": {},
"outputs": [],
"source": [
"customer_target_mappings['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
"metadata": {},
"outputs": [],
"source": [
"# Segmentation existante\n",
"print(target_types.columns)\n",
"print(target_types.shape)\n",
"target_types.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5adb1773-648d-4683-bc08-d1f2298c1283",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"target_types"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
"metadata": {},
"outputs": [],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tags = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tags.columns)\n",
"print(tags.shape)\n",
"tags.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
"metadata": {},
"outputs": [],
"source": [
"tags"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
"metadata": {},
"outputs": [],
"source": [
"# Structure = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(structure_tag_mappings.columns)\n",
"print(structure_tag_mappings.shape)\n",
"structure_tag_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74dc34ad-375b-48df-a900-40d92c5fff13",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
"metadata": {},
"outputs": [],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customersplus.columns)\n",
"print(customersplus.shape)\n",
"customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
"metadata": {},
"outputs": [],
"source": [
"customersplus"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
"metadata": {},
"outputs": [],
"source": [
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
"metadata": {},
"outputs": [],
"source": [
"# tickets\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tickets = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tickets.columns)\n",
"print(tickets.shape)\n",
"tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
"metadata": {},
"outputs": [],
"source": [
"tickets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
"metadata": {},
"outputs": [],
"source": [
"tickets['type_of'].unique()"
]
},
{
"cell_type": "markdown",
"id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
"metadata": {},
"source": [
"## Types d'évenement et client"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
"metadata": {},
"outputs": [],
"source": [
"# Evenement = events.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" events = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(events.columns)\n",
"print(events.shape)\n",
"events.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
"metadata": {},
"outputs": [],
"source": [
"events"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af80eee8-f717-4159-a0fd-09d47ec96621",
"metadata": {},
"outputs": [],
"source": [
"events['name'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
"metadata": {},
"outputs": [],
"source": [
"# Représentation des évenements = representations.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" representations = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(representations.columns)\n",
"print(representations.shape)\n",
"representations.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
"metadata": {},
"outputs": [],
"source": [
"representations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
"metadata": {},
"outputs": [],
"source": [
"# Produits vendues = products.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" products = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(products.columns)\n",
"print(products.shape)\n",
"products.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
"metadata": {},
"outputs": [],
"source": [
"products"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
"metadata": {},
"outputs": [],
"source": [
"# Lieu = facilities.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" facilities = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(facilities.columns)\n",
"print(facilities.shape)\n",
"facilities.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3642483-2879-442a-ad69-efcd2331a200",
"metadata": {},
"outputs": [],
"source": [
"facilities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
"metadata": {},
"outputs": [],
"source": [
"# Saisons = seasons.csv période sur deux années consécutives\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" seasons = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(seasons.columns)\n",
"print(seasons.shape)\n",
"seasons.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
"metadata": {},
"outputs": [],
"source": [
"seasons['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
"metadata": {},
"outputs": [],
"source": [
"# Achats = purchases.csv \n",
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" purchases = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(purchases.columns)\n",
"print(purchases.shape)\n",
"purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
"metadata": {},
"outputs": [],
"source": [
"purchases"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -124,9 +124,7 @@
{
"cell_type": "markdown",
"id": "e855f403",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"## customersplus.csv"
]
@ -1289,7 +1287,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"version": "3.11.6"
}
},
"nbformat": 4,