Merge branch 'data_construction'
This commit is contained in: commit 29eafcc6b2

0_1_Input_cleaning.py (new file, 58 lines)
@@ -0,0 +1,58 @@
# Business Data Challenge - Team 1

import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Import cleaning and merge functions
exec(open('0_Cleaning_and_merge_functions.py').read())

# Output folder
BUCKET_OUT = "projet-bdc2324-team1"

# Ignore warnings
warnings.filterwarnings('ignore')


def export_dataset(df, output_name):
    print('Exportation of dataset :', output_name)
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        df.to_csv(file_out, index = False)

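Both the read and the write side of this script go through the same s3fs object; a minimal sketch of the two patterns, for illustration only (the raw-file path follows the "bdc2324-data/<id>/<id><table>.csv" layout that appears elsewhere in this commit, and the sketch reuses the fs and BUCKET_OUT objects defined above):

# Sketch, not part of the committed file: read one raw table, then write a cleaned one back
with fs.open("bdc2324-data/1/1customersplus.csv", mode="rb") as file_in:
    customersplus = pd.read_csv(file_in, sep=",")
with fs.open(BUCKET_OUT + "/0_Input/Company_1/customerplus_cleaned.csv", 'w') as file_out:
    customersplus.to_csv(file_out, index=False)
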
## 1 - Cleaning of the datasets
for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
    # Cleaning customerplus
    df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)

    ## Export
    export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")

    # Cleaning target area
    df1_target_information = preprocessing_target_area(directory_path = tenant_id)
    ## Export
    export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")

    # Cleaning campaign area
    df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
    ## Export
    export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")

    ## Export
    # export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")

    # Cleaning product area
    df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
    ## Export
    export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
    # Export
    # export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")

    print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")

0_2_Dataset_construction.py (new file, 128 lines)
@@ -0,0 +1,128 @@
# Business Data Challenge - Team 1

import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


# Import cleaning and merge functions
exec(open('0_KPI_functions.py').read())

# Ignore warnings
warnings.filterwarnings('ignore')


def dataset_construction(min_date, end_features_date, max_date, directory_path):

    # Import customerplus
    df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
    df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
    df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])

    # Consistency filter before applying our method
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

    # Filter df_campaigns_information
    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')

    # Filter df_products_purchased_reduced
    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]

    print("Data filtering : SUCCESS")
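    # Sketch, not part of the committed file: the chained indexing used on 'opened_at' above
    # can trigger pandas' SettingWithCopyWarning (one reason the warnings filter is on);
    # an equivalent warning-free form would be:
    # df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = np.datetime64('NaT')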

    # Merge everything and build the KPIs

    # KPIs on advertising campaigns
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)

    # KPIs on purchasing behaviour
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)

    # KPIs on socio-demographic data

    ## Gender
    df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
        0: 'female',
        1: 'male',
        2: 'other'
    })
    gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
    df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
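    # Sketch, not part of the committed file: for gender values 0, 1, 2 the two lines above
    # add the integer dummy columns gender_female, gender_male, gender_other; e.g. a customer
    # with gender == 1 gets gender_female = 0, gender_male = 1, gender_other = 0.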

    ## Indicator: does the customer live in France
    df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)

    print("KPIs construction : SUCCESS")

    # Merge with customer-related KPIs
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)

    # Merge with purchasing-behaviour KPIs
    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')

    # Fill NaN values
    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)

    print("Explanatory variable construction : SUCCESS")

    # 2. Construction of the explained variable
    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]

    # Purchase indicator
    df_products_purchased_to_predict['y_has_purchased'] = 1

    y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()

    print("Explained variable construction : SUCCESS")

    # 3. Merge between explained and explanatory variables
    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')

    # 0 if there is no purchase (assign the result back, otherwise fillna is a no-op)
    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)

    return dataset

## Export

# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"

# Dataset test
dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")

# # Export
# FILE_KEY_OUT_S3 = "dataset_test.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
#     dataset_test.to_csv(file_out, index = False)

# print("Exportation dataset test : SUCCESS")

# Dataset train
dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")

# Export
# FILE_KEY_OUT_S3 = "dataset_train.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
#     dataset_train.to_csv(file_out, index = False)

# print("Exportation dataset train : SUCCESS")


print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
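The two dataset_construction calls above implement a rolling temporal split; summarised as a sketch, with the dates taken directly from the calls:

# train: features on [2021-05-01, 2023-05-01], y_has_purchased measured on (2023-05-01, 2023-08-01]
# test : features on [2021-08-01, 2023-08-01], y_has_purchased measured on (2023-08-01, 2023-11-01]
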
@@ -1,193 +0,0 @@
# Business Data Challenge - Team 1

import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Import cleaning and merge functions
exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
exec(open('BDC-team-1/0_KPI_functions.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Ignore warning
warnings.filterwarnings('ignore')

# Data loading
BUCKET = "bdc2324-data/1"
liste_database = fs.ls(BUCKET)

# loop to create dataframes from liste
client_number = liste_database[0].split("/")[1]
df_prefix = "df" + str(client_number) + "_"

for i in range(len(liste_database)) :
    current_path = liste_database[i]
    with fs.open(current_path, mode="rb") as file_in:
        df = pd.read_csv(file_in)
    # the pattern of the name is df1xxx
    nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
    globals()[nom_dataframe] = df

## 1 - Cleaning of the datasets

# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)

# Cleaning target area
df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)

# Cleaning campaign area
df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)

# Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Campaigns dataset clean.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    df1_campaigns_information.to_csv(file_out, index = False)

## Cleaning product area

# Cleaning ticket area
df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)


BUCKET = "bdc2324-data"
directory_path = '1'

products_theme = create_products_table()
events_theme = create_events_table()
representation_theme = create_representations_table()
products_global = uniform_product_df()

# Fusion liée au product
df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')

# Selection des variables d'intérêts
df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]

# Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchases.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    df1_products_purchased_reduced.to_csv(file_out, index = False)

## 2 - Construction of KPIs on a given period

def explanatory_variables(min_date, max_date, df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):

    # Filtre de cohérence pour la mise en pratique de notre méthode
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

    # Filtre de la base df_campaigns_information
    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= max_date] = np.datetime64('NaT')

    # Filtre de la base df_products_purchased_reduced
    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]

    print("Data filtering : SUCCESS")

    # Fusion de l'ensemble et creation des KPI

    # KPI sur les campagnes publicitaires
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)

    # KPI sur le comportement d'achat
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)

    # KPI sur les données socio-demographique

    ## Le genre
    df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
        0: 'female',
        1: 'male',
        2: 'other'
    })
    gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
    df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)

    ## Indicatrice si individue vit en France
    df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)

    print("KPIs construction : SUCCESS")

    # Fusion avec KPI liés au customer
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)

    # Fusion avec KPI liés au comportement d'achat
    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')

    # Fill NaN values
    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)

    print("Explanatory variable construction : SUCCESS")

    return df_customer_product

# Fonction pour créer les variables expliquée
def explained_variable(min_date, max_date, df_products_purchased_reduced = df1_products_purchased_reduced):

    # Filtrer la base d'achat
    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > min_date)]

    # Indicatrice d'achat
    df_products_purchased_reduced['y_has_purchased'] = 1

    y = df_products_purchased_reduced[['customer_id', 'event_type_id', 'y_has_purchased']].drop_duplicates()

    print("Explained variable construction : SUCCESS")

    return y

## Exportation

# Dossier d'exportation
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"

# Dataset test
X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)

y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced)

dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 'left')

# Exportation
FILE_KEY_OUT_S3 = "dataset_test.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    dataset_test.to_csv(file_out, index = False)

print("Exportation dataset test : SUCCESS")

# Dataset train
X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)

y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced)

dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left')

# Exportation
FILE_KEY_OUT_S3 = "dataset_train.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    dataset_train.to_csv(file_out, index = False)

print("Exportation dataset train : SUCCESS")


print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
@@ -1,38 +1,92 @@
-# Cleaning and merge functions
+#### Cleaning and merge functions ####

-# Cleaning function
+BUCKET = "bdc2324-data"
+
+# 1. Basic cleaning functions
 def cleaning_date(df, column_name):
     """
-    Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.
-
-    Parameters:
-    - df: DataFrame
-        Le DataFrame contenant la colonne à nettoyer.
-    - column_name: str
-        Le nom de la colonne à nettoyer.
-
-    Returns:
-    - DataFrame
-        Le DataFrame modifié avec la colonne nettoyée.
+    Datetime columns cleaning with ISO format
     """
     df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
     return df

-def preprocessing_customerplus(customerplus = None):
+def display_databases(directory_path, file_name):
+    """
+    This function returns the file from s3 storage
+    """
+    file_path = BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",")
+
+    print("Shape : ", df.shape)
+    return df
+
-    customerplus_copy = customerplus.copy()
+def remove_horodates(df):
+    """
+    this function remove horodate columns like created_at and updated_at
+    """
+    df = df.drop(columns = ["created_at", "updated_at"])
+    return df
+
+def order_columns_id(df):
+    """
+    this function puts all id columns at the beginning in order to read the dataset easier
+    """
+    substring = 'id'
+    id_columns = [col for col in df.columns if substring in col]
+    remaining_col = [col for col in df.columns if substring not in col]
+    new_order = id_columns + remaining_col
+    return df[new_order]
+
+def process_df_2(df):
+    """
+    This function organizes dataframe
+    """
+    df = remove_horodates(df)
+    print("Number of columns : ", len(df.columns))
+    df = order_columns_id(df)
+    print("Columns : ", df.columns)
+    return df
+
+def load_dataset(directory_path, name):
+    """
+    This function loads csv file
+    """
+    df = display_databases(directory_path, file_name = name)
+    df = process_df_2(df)
+    # drop na :
+    #df = df.dropna(axis=1, thresh=len(df))
+    # if identifier in table : delete it
+    if 'identifier' in df.columns:
+        df = df.drop(columns = 'identifier')
+    return df
+
+
+# 2. Creation of cleaned and merged datasets
+
+def preprocessing_customerplus(directory_path):
+
+    customerplus_copy = load_dataset(directory_path, name = "customersplus")

     # Passage en format date
     cleaning_date(customerplus_copy, 'first_buying_date')
     cleaning_date(customerplus_copy, 'last_visiting_date')

     # Selection des variables
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
     customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)

     return customerplus_copy

-def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    type_ofs = load_dataset(directory_path, name = "type_ofs")

     # Base des tickets
     tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
     tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
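The display_databases helper added above builds the S3 key from the company folder and the table name; a small illustration of the resulting path, with values taken from elsewhere in this commit and shown only as an example:

# display_databases("1", file_name = "customersplus")
# -> file_path == "bdc2324-data/1/1customersplus.csv"
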
@@ -48,7 +102,7 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non

     # Base des achats
     # Nettoyage de la date d'achat
-    cleaning_date(purchases, 'purchase_date')
+    # cleaning_date(purchases, 'purchase_date')
     # Selection des variables
     purchases = purchases[['id', 'purchase_date', 'customer_id']]

@@ -67,8 +121,13 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non

     return ticket_information

-def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
-    # Target.csv cleaning
+def preprocessing_target_area(directory_path):
+
+    # Datasets loading
+    targets = load_dataset(directory_path, name = "targets")
+    target_types = load_dataset(directory_path, name = "target_types")
+    customer_target_mappings = load_dataset(directory_path, name = "customer_target_mappings")
+    # target cleaning
     targets = targets[["id", "target_type_id", "name"]]
     targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)

@@ -88,16 +147,21 @@ def preprocessing_target_area(targets = None, target_types = None, customer_targ

     return targets_full

-def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
+def preprocessing_campaigns_area(directory_path):
+
+    # Datasets loading
+    campaign_stats = load_dataset(directory_path, name = "campaign_stats")
+    campaigns = load_dataset(directory_path, name = "campaigns")
+
     # campaign_stats cleaning
     campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
-    cleaning_date(campaign_stats, 'opened_at')
-    cleaning_date(campaign_stats, 'sent_at')
-    cleaning_date(campaign_stats, 'delivered_at')
+    # cleaning_date(campaign_stats, 'opened_at')
+    # cleaning_date(campaign_stats, 'sent_at')
+    # cleaning_date(campaign_stats, 'delivered_at')

     # campaigns cleaning
     campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
-    cleaning_date(campaigns, 'campaign_sent_at')
+    # cleaning_date(campaigns, 'campaign_sent_at')

     # Merge
     campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")

@@ -105,66 +169,11 @@ def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):

     return campaigns_full

-def display_databases(file_name):
-    """
-    This function returns the file from s3 storage
-    """
-    file_path = BUCKET + "/" + directory_path + "/" + file_name
-    print("File path : ", file_path)
-    with fs.open(file_path, mode="rb") as file_in:
-        df = pd.read_csv(file_in, sep=",")
-
-    print("Shape : ", df.shape)
-    return df
-
-
-def remove_horodates(df):
-    """
-    this function remove horodate columns like created_at and updated_at
-    """
-    df = df.drop(columns = ["created_at", "updated_at"])
-    return df
-
-
-def order_columns_id(df):
-    """
-    this function puts all id columns at the beginning in order to read the dataset easier
-    """
-    substring = 'id'
-    id_columns = [col for col in df.columns if substring in col]
-    remaining_col = [col for col in df.columns if substring not in col]
-    new_order = id_columns + remaining_col
-    return df[new_order]
-
-
-def process_df_2(df):
-    """
-    This function organizes dataframe
-    """
-    df = remove_horodates(df)
-    print("Number of columns : ", len(df.columns))
-    df = order_columns_id(df)
-    print("Columns : ", df.columns)
-    return df
-
-def load_dataset(name):
-    """
-    This function loads csv file
-    """
-    df = display_databases(name)
-    df = process_df_2(df)
-    # drop na :
-    #df = df.dropna(axis=1, thresh=len(df))
-    # if identifier in table : delete it
-    if 'identifier' in df.columns:
-        df = df.drop(columns = 'identifier')
-    return df
-
-def create_products_table():
+def create_products_table(directory_path):
     # first merge products and categories
     print("first merge products and categories")
-    products = load_dataset("1products.csv")
-    categories = load_dataset("1categories.csv")
+    products = load_dataset(directory_path, name = "products")
+    categories = load_dataset(directory_path, name = "categories")
     # Drop useless columns
     products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
     categories = categories.drop(columns = ['extra_field', 'quota'])

@@ -176,7 +185,7 @@ def create_products_table():

     # Second merge products_theme and type of categories
     print("Second merge products_theme and type of categories")
-    type_of_categories = load_dataset("1type_of_categories.csv")
+    type_of_categories = load_dataset(directory_path, name = "type_of_categories")
     type_of_categories = type_of_categories.drop(columns = 'id')
     products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
                                           right_on = 'category_id' )

@@ -187,11 +196,11 @@ def create_products_table():

     return products_theme


-def create_events_table():
+def create_events_table(directory_path):
     # first merge events and seasons :
     print("first merge events and seasons : ")
-    events = load_dataset("1events.csv")
-    seasons = load_dataset("1seasons.csv")
+    events = load_dataset(directory_path, name = "events")
+    seasons = load_dataset(directory_path, name = "seasons")

     # Drop useless columns
     events = events.drop(columns = ['manual_added', 'is_display'])

@@ -201,7 +210,7 @@ def create_events_table():

     # Secondly merge events_theme and event_types
     print("Secondly merge events_theme and event_types : ")
-    event_types = load_dataset("1event_types.csv")
+    event_types = load_dataset(directory_path, name = "event_types")
     event_types = event_types.drop(columns = ['fidelity_delay'])

     events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))

@@ -210,7 +219,7 @@ def create_events_table():

     # thirdly merge events_theme and facilities
     print("thirdly merge events_theme and facilities : ")
-    facilities = load_dataset("1facilities.csv")
+    facilities = load_dataset(directory_path, name = "facilities")
     facilities = facilities.drop(columns = ['fixed_capacity'])

     events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))

@@ -222,14 +231,13 @@ def create_events_table():

     events_theme = order_columns_id(events_theme)
     return events_theme

-def create_representations_table():
-    representations = load_dataset("1representations.csv")
+def create_representations_table(directory_path):
+    representations = load_dataset(directory_path, name = "representations")
     representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
                                                       'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
                                                       'representation_type_id'])

-    representations_capacity = load_dataset("1representation_category_capacities.csv")
+    representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
     representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])

     representations_theme = representations.merge(representations_capacity, how='left',

@@ -240,22 +248,27 @@ def create_representations_table():

     representations_theme = order_columns_id(representations_theme)
     return representations_theme

-def uniform_product_df():
+def uniform_product_df(directory_path):
     """
     This function returns the uniform product dataset
     """
+    products_theme = create_products_table(directory_path)
+    representation_theme = create_representations_table(directory_path)
+    events_theme = create_events_table(directory_path)
+    ticket_information = preprocessing_tickets_area(directory_path)
+
     print("Products theme columns : ", products_theme.columns)
     print("\n Representation theme columns : ", representation_theme.columns)
     print("\n Events theme columns : ", events_theme.columns)

-    products_global = products_theme.merge(representation_theme, how='left',
+    products_global = pd.merge(products_theme, representation_theme, how='left',
                                            on= ["representation_id", "category_id"])

-    products_global = products_global.merge(events_theme, how='left', on='event_id',
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
                                             suffixes = ("_representation", "_event"))

-    products_global = order_columns_id(products_global)
-
-    # remove useless columns
-    products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'
-    return products_global
+    products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
+
+    return products_purchased_reduced
@@ -1,6 +1,20 @@
 # Function de construction de KPI

+def custom_date_parser(date_string):
+    return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
+
+def display_databases(directory_path, file_name, datetime_col = None):
+    """
+    This function returns the file from s3 storage
+    """
+    file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
+    return df
+
 def campaigns_kpi_function(campaigns_information = None):

     # Nombre de campagnes de mails
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
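The version above pushes datetime parsing into read_csv: parse_dates plus custom_date_parser applies the same conversion that was previously done by hand after loading (the notebook hunk further down shows that manual line being removed). As a sketch of the equivalence on one column, not part of the commit:

# df = pd.read_csv(file_in, sep=",")
# df['purchase_date'] = pd.to_datetime(df['purchase_date'], utc=True, format='ISO8601')

Note that the date_parser argument is deprecated in recent pandas releases, so the hand-written conversion after the read remains the safer long-term form.
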
@@ -29,23 +43,23 @@ def campaigns_kpi_function(campaigns_information = None):

 def tickets_kpi_function(tickets_information = None):

     tickets_information_copy = tickets_information.copy()

     # Dummy : Canal de vente en ligne
     liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance
     tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)

     # Proportion de vente en ligne
-    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()
+    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
     prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)

     # Average amount
-    avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
-                  .agg({"amount" : "mean"}).reset_index()
-                  .rename(columns = {'amount' : 'avg_amount'}))
+    # avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
+    #               .agg({"amount" : "mean"}).reset_index()
+    #               .rename(columns = {'amount' : 'avg_amount'}))


-    tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
-                   .groupby(['customer_id', 'event_type_id'])
+    tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
+                   .groupby(['customer_id'])
                    .agg({'ticket_id': 'count',
                          'purchase_id' : 'nunique',
                          'amount' : 'sum',

@@ -61,8 +75,7 @@ def tickets_kpi_function(tickets_information = None):

                           'purchase_id_nunique' : 'nb_purchases',
                           'amount_sum' : 'total_amount',
                           'supplier_name_nunique' : 'nb_suppliers',
-                          'customer_id_' : 'customer_id',
-                          'event_type_id_' : 'event_type_id'}, inplace = True)
+                          'customer_id_' : 'customer_id'}, inplace = True)

     tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
     tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # En nombre de jours

@@ -73,10 +86,10 @@ def tickets_kpi_function(tickets_information = None):

     tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')


-    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')
+    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
     tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)

-    tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
+    # tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')

     return tickets_kpi

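The vente_internet dummy in the hunk above is a plain regex alternation over supplier names; a tiny illustration with invented supplier labels, not data from the project:

# Sketch only: '|'.join(liste_mots) builds the pattern 'en ligne|internet|web|net|vad|online'
import pandas as pd
s = pd.Series(['Guichet', 'Vente en ligne', 'VAD telephone', 'Billetterie web'])
print(s.str.contains('en ligne|internet|web|net|vad|online', case=False).astype(int).tolist())
# -> [0, 1, 1, 1]
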
@@ -615,19 +615,15 @@
     "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
     "\n",
     "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    purchases = pd.read_csv(file_in, sep=\",\")\n",
+    "    purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'])\n",
     "    \n",
-    "purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True, format = 'ISO8601')\n",
-    "\n",
     "# Emails\n",
     "BUCKET = \"projet-bdc2324-team1\"\n",
     "FILE_KEY_S3 = \"0_Temp/Company 1 - Campaigns dataset clean.csv\"\n",
     "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
     "\n",
     "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    campaigns = pd.read_csv(file_in, sep=\",\")\n",
-    "\n",
-    "campaigns['sent_at'] = pd.to_datetime(campaigns['sent_at'], utc = True, format = 'ISO8601')\n"
+    "    campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'])\n"
    ]
   },
   {
@@ -818,7 +814,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 16,
    "id": "f663d68b-8a5c-4804-b31a-4477a03ca1e4",
    "metadata": {
     "scrolled": true
@@ -906,7 +902,7 @@
       "max 641981.000000 1.256574e+06"
      ]
     },
-    "execution_count": 33,
+    "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -917,7 +913,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "d1212b10-3933-450a-b001-9e2cbf308f79",
    "metadata": {},
    "outputs": [
@@ -1219,7 +1215,7 @@
       "[1826672 rows x 15 columns]"
      ]
     },
-    "execution_count": 16,
+    "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1238,7 +1234,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "dc45c1cd-2a78-48a6-aa2b-6a501254b6f2",
    "metadata": {},
    "outputs": [
@@ -1458,7 +1454,7 @@
       "[5 rows x 40 columns]"
      ]
     },
-    "execution_count": 17,
+    "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1478,7 +1474,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "89fcb455-efb4-4ad4-ab88-efd6c8a76287",
    "metadata": {},
    "outputs": [
@@ -1499,7 +1495,7 @@
       " dtype='object')"
      ]
     },
-    "execution_count": 18,
+    "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1510,7 +1506,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "id": "d7b2356a-d5fc-4547-b3ff-fded0e304fb6",
    "metadata": {},
    "outputs": [
@@ -1634,7 +1630,7 @@
       "9 0.0 "
      ]
     },
-    "execution_count": 19,
+    "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1653,7 +1649,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "id": "5559748f-1745-4651-a9f6-94702c7ee66f",
    "metadata": {},
    "outputs": [
@@ -1813,7 +1809,7 @@
       "max 434.000000 "
      ]
     },
-    "execution_count": 20,
+    "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1835,7 +1831,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "4971e35d-a762-4e18-9443-fd9571bd3f1e",
    "metadata": {},
    "outputs": [
@@ -1864,7 +1860,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "bc65a711-d172-4839-b487-3047280fc3a6",
    "metadata": {},
    "outputs": [
@@ -1894,7 +1890,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "c95cc35c-abfc-47c7-9b8a-ac69bfd60dd8",
    "metadata": {},
    "outputs": [
@@ -1922,7 +1918,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "id": "49d5fd2d-9bc1-43ac-9270-1efd73759854",
    "metadata": {},
    "outputs": [
@@ -1967,7 +1963,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "id": "e50e2583-4b8f-478e-87ac-591dde200af8",
    "metadata": {},
    "outputs": [
@@ -1988,7 +1984,7 @@
       " dtype='object')"
      ]
     },
-    "execution_count": 25,
+    "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1999,7 +1995,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 27,
    "id": "c724a315-9fe8-4874-be8f-a8115b17b5e2",
    "metadata": {},
    "outputs": [],
@@ -2021,7 +2017,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 28,
    "id": "58af5dcb-673e-4f4d-ad5c-f66ce1e8a22c",
    "metadata": {},
    "outputs": [
@@ -2042,7 +2038,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 29,
    "id": "cc3437f7-8b36-4398-9da6-ff15e8e4c8d7",
    "metadata": {},
    "outputs": [
@ -1,695 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Business Data Challenge - Team 1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Configuration de l'accès aux données"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"# Create filesystem object\n",
|
|
||||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
||||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
|
|
||||||
"\n",
|
|
||||||
"BUCKET = \"bdc2324-data\"\n",
|
|
||||||
"fs.ls(BUCKET)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement des fichiers campaign_stats.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Conversion des dates 'sent_at'\n",
|
|
||||||
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].min())"
|
|
||||||
]
|
|
||||||
},
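The six prints above can be condensed into one small summary table — a sketch, assuming the campaign_stats_* frames from the previous cells:

import pandas as pd

# One row per company folder, with the min/max 'sent_at' of its campaign_stats file
date_ranges = pd.DataFrame({
    name: {'min_sent_at': df['sent_at'].min(), 'max_sent_at': df['sent_at'].max()}
    for name, df in {'company_1': campaign_stats_1,
                     'company_2': campaign_stats_2,
                     'company_3': campaign_stats_3}.items()
}).T
print(date_ranges)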
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"campaign_stats_1['sent_at']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Customersplus.csv"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.columns"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.shape"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_2['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
|
|
||||||
"# Exemple id commun = caractéristiques communes\n",
|
|
||||||
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
|
|
||||||
"\n",
|
|
||||||
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
|
|
||||||
]
|
|
||||||
},
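Printing a single shared id only eyeballs the overlap. A sketch like the following (assuming customers_plus_1, customers_plus_2 and common_id from the cells above, and that ids are unique within each table) checks column by column whether shared ids really carry the same characteristics:

# Align the overlapping rows on their id and compare the columns both tables share
shared_1 = customers_plus_1[customers_plus_1['id'].isin(common_id)].set_index('id').sort_index()
shared_2 = customers_plus_2[customers_plus_2['id'].isin(common_id)].set_index('id').sort_index()
common_cols = shared_1.columns.intersection(shared_2.columns)
# True for columns that agree on every shared id (NaN vs NaN counts as a mismatch here)
print(shared_1[common_cols].eq(shared_2[common_cols]).all())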
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.isna().mean()*100"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "6f6ce60d-0912-497d-9108-330acccef394",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement de toutes les données\n",
|
|
||||||
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
|
|
||||||
"\n",
|
|
||||||
"for nom_base in liste_base:\n",
|
|
||||||
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
|
|
||||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
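Writing into globals() works, but a plain dict keeps the loaded tables easier to iterate over and to pass around. A possible variant — a sketch, assuming fs from the setup cell:

import pandas as pd

liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags',
              'events', 'tickets', 'representations', 'purchases', 'products']
tables = {}
for nom_base in liste_base:
    # Same file layout as above: bdc2324-data/11/11<table>.csv
    with fs.open('bdc2324-data/11/11' + nom_base + '.csv', mode="rb") as file_in:
        tables[nom_base] = pd.read_csv(file_in, sep=",")
# e.g. tables['purchases'].head()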
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Jointure\n",
|
|
||||||
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
|
|
||||||
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
|
|
||||||
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
|
|
||||||
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
|
|
||||||
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
|
||||||
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
|
|
||||||
"df_customer_event"
|
|
||||||
]
|
|
||||||
},
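Every step above is an inner join, so rows can silently disappear or multiply. A quick cardinality check (a sketch, assuming the frames built in the cell above) makes that visible:

# Row count after each merge, to spot silent row loss or duplication
for name, df in [('merge_1', merge_1), ('merge_2', merge_2), ('merge_3', merge_3),
                 ('merge_4', merge_4), ('df_customer_event', df_customer_event)]:
    print(f"{name:>20}: {len(df):>9} rows")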
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Fusion et exploration"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Jointure\n",
|
|
||||||
"var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
|
|
||||||
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
|
|
||||||
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"var_choosed.remove('representation_id')\n",
|
|
||||||
"var_choosed.extend(['start_date_time', 'event_id'])\n",
|
|
||||||
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"var_choosed.remove('event_id')\n",
|
|
||||||
"var_choosed.extend(['name', 'customer_id'])\n",
|
|
||||||
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"# Changement de nom\n",
|
|
||||||
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
|
||||||
"var_choosed[var_choosed.index('name')] = \"event_name\"\n",
|
|
||||||
"\n",
|
|
||||||
"# Base finale\n",
|
|
||||||
"var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
|
|
||||||
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
|
|
||||||
"df_customer_event"
|
|
||||||
]
|
|
||||||
},
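One caveat in the bookkeeping above: var_choosed already contains 'customer_id', so extend(['name', 'customer_id']) adds it a second time and merge_4[var_choosed] ends up with a duplicated column. A guarded append avoids that — a minimal sketch:

# Only add columns that are not already tracked in var_choosed
for col in ['name', 'customer_id']:
    if col not in var_choosed:
        var_choosed.append(col)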
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Type de client au globale"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Client\n",
|
|
||||||
"print(customer_target_mappings.columns)\n",
|
|
||||||
"print(customer_target_mappings.shape)\n",
|
|
||||||
"customer_target_mappings.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customer_target_mappings['extra_field'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customer_target_mappings['name'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Segmentation existante\n",
|
|
||||||
"print(target_types.columns)\n",
|
|
||||||
"print(target_types.shape)\n",
|
|
||||||
"target_types.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "5adb1773-648d-4683-bc08-d1f2298c1283",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"target_types"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Tags = clients\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" tags = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(tags.columns)\n",
|
|
||||||
"print(tags.shape)\n",
|
|
||||||
"tags.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"tags"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Structure = clients\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(structure_tag_mappings.columns)\n",
|
|
||||||
"print(structure_tag_mappings.shape)\n",
|
|
||||||
"structure_tag_mappings.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "74dc34ad-375b-48df-a900-40d92c5fff13",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"structure_tag_mappings"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Tags = clients\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(customersplus.columns)\n",
|
|
||||||
"print(customersplus.shape)\n",
|
|
||||||
"customersplus.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customersplus"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
|
|
||||||
]
|
|
||||||
},
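A sketch of that goal, building on merge_4 and customersplus from the cells above (the column names follow the ones used elsewhere in this notebook, so this is an illustration rather than a finished feature table): aggregate each customer's purchase behaviour and attach it to the socio-demographic columns.

# Per-customer purchase behaviour from the joined table, then joined back to customersplus
behaviour = (merge_4
             .groupby('customer_id')
             .agg(nb_tickets=('id_x', 'count'),
                  total_amount=('amount', 'sum'),
                  first_purchase=('purchase_date', 'min'),
                  last_purchase=('purchase_date', 'max')))
profile = customersplus.merge(behaviour, left_on='id', right_index=True, how='left')
profile[['id', 'age', 'gender', 'country', 'fidelity', 'profession',
         'nb_tickets', 'total_amount']].head()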
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# tickets\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" tickets = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(tickets.columns)\n",
|
|
||||||
"print(tickets.shape)\n",
|
|
||||||
"tickets.info()"
|
|
||||||
]
|
|
||||||
},
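Before relying on the inner joins further down, it is worth checking how many tickets actually reference a purchase. A sketch, assuming tickets and purchases are loaded as in the cells above:

# Share of tickets whose purchase_id exists in purchases['id']
n_linked = tickets['purchase_id'].isin(purchases['id']).sum()
print(f"{n_linked} of {len(tickets)} tickets reference an existing purchase "
      f"({n_linked / len(tickets):.1%})")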
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"tickets"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"tickets['type_of'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Types d'évenement et client"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Evenement = events.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" events = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(events.columns)\n",
|
|
||||||
"print(events.shape)\n",
|
|
||||||
"events.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"events"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "af80eee8-f717-4159-a0fd-09d47ec96621",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"events['name'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Représentation des évenements = representations.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" representations = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(representations.columns)\n",
|
|
||||||
"print(representations.shape)\n",
|
|
||||||
"representations.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"representations"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Produits vendues = products.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" products = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(products.columns)\n",
|
|
||||||
"print(products.shape)\n",
|
|
||||||
"products.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"products"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Lieu = facilities.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" facilities = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(facilities.columns)\n",
|
|
||||||
"print(facilities.shape)\n",
|
|
||||||
"facilities.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "b3642483-2879-442a-ad69-efcd2331a200",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"facilities"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Saisons = seasons.csv période sur deux années consécutives\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" seasons = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(seasons.columns)\n",
|
|
||||||
"print(seasons.shape)\n",
|
|
||||||
"seasons.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"seasons['name'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Achats = purchases.csv \n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" purchases = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(purchases.columns)\n",
|
|
||||||
"print(purchases.shape)\n",
|
|
||||||
"purchases.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"purchases"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
|
@ -124,9 +124,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "e855f403",
|
"id": "e855f403",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"jp-MarkdownHeadingCollapsed": true
|
|
||||||
},
|
|
||||||
"source": [
|
"source": [
|
||||||
"## customersplus.csv"
|
"## customersplus.csv"
|
||||||
]
|
]
|
||||||
|
@ -1289,7 +1287,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.13"
|
"version": "3.11.6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user