Merge branch 'data_construction'

This commit is contained in:
Antoine JOUBREL 2024-02-20 22:46:14 +00:00
commit 29eafcc6b2
8 changed files with 348 additions and 1030 deletions

58
0_1_Input_cleaning.py Normal file
View File

@ -0,0 +1,58 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_Cleaning_and_merge_functions.py').read())
# Output folder
BUCKET_OUT = "projet-bdc2324-team1"
# Ignore warnings
warnings.filterwarnings('ignore')
def export_dataset(df, output_name):
print('Exporting dataset:', output_name)
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df.to_csv(file_out, index = False)
## 1 - Cleaning of the datasets
for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")
# Cleaning target area
df1_target_information = preprocessing_target_area(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")
# Cleaning campaign area
df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
## Exportation
# export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")
# Cleaning product area
df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
#Exportation
# export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")
print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")

128
0_2_Dataset_construction.py Normal file
View File

@ -0,0 +1,128 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_KPI_functions.py').read())
# Ignore warnings
warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import customerplus
df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
# Consistency filter for applying our method in practice
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter the df_campaigns_information table
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = np.datetime64('NaT')
# Filter the df_products_purchased_reduced table
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Merge everything and build the KPIs
# KPIs on email campaigns
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
# KPIs on purchasing behaviour
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
# KPIs on socio-demographic data
## Gender
df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
## Indicator for whether the customer lives in France
df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
print("KPIs construction : SUCCESS")
# Merge with customer-related KPIs
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Merge with purchasing-behaviour KPIs
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)].copy()
# Purchase indicator
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
# 3. Merge between explained and explanatory variables
dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
# 0 if there is no purchase
dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)
return dataset
## Export
# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
# Dataset test
dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")
# # Exportation
# FILE_KEY_OUT_S3 = "dataset_test.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_test.to_csv(file_out, index = False)
# print("Exportation dataset test : SUCCESS")
# Dataset train
dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")
# Exportation
# FILE_KEY_OUT_S3 = "dataset_train.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_train.to_csv(file_out, index = False)
# print("Exportation dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")

View File

@ -1,193 +0,0 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Import cleaning and merge functions
exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
exec(open('BDC-team-1/0_KPI_functions.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Ignore warning
warnings.filterwarnings('ignore')
# Data loading
BUCKET = "bdc2324-data/1"
liste_database = fs.ls(BUCKET)
# Loop to create a dataframe for each file in the list
client_number = liste_database[0].split("/")[1]
df_prefix = "df" + str(client_number) + "_"
for i in range(len(liste_database)) :
current_path = liste_database[i]
with fs.open(current_path, mode="rb") as file_in:
df = pd.read_csv(file_in)
# the pattern of the name is df1xxx
nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
globals()[nom_dataframe] = df
## 1 - Cleaning of the datasets
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
# Cleaning target area
df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)
# Cleaning campaign area
df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)
# Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Campaigns dataset clean.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df1_campaigns_information.to_csv(file_out, index = False)
## Cleaning product area
# Cleaning ticket area
df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)
BUCKET = "bdc2324-data"
directory_path = '1'
products_theme = create_products_table()
events_theme= create_events_table()
representation_theme = create_representations_table()
products_global = uniform_product_df()
# Product-related merge
df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
# Select the variables of interest
df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
#Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchases.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df1_products_purchased_reduced.to_csv(file_out, index = False)
## 2 - Construction of KPIs on a given period
def explanatory_variables(min_date, max_date, df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):
# Consistency filter for applying our method in practice
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter the df_campaigns_information table
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= max_date] = np.datetime64('NaT')
# Filter the df_products_purchased_reduced table
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Merge everything and build the KPIs
# KPIs on email campaigns
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
# KPIs on purchasing behaviour
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
# KPIs on socio-demographic data
## Gender
df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
## Indicator for whether the individual lives in France
df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
print("KPIs construction : SUCCESS")
# Merge with customer-related KPIs
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Merge with purchasing-behaviour KPIs
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
print("Explanatory variable construction : SUCCESS")
return df_customer_product
# Function to build the explained variable
def explained_variable(min_date, max_date, df_products_purchased_reduced = df1_products_purchased_reduced):
# Filter the purchase table
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > min_date)]
# Purchase indicator
df_products_purchased_reduced['y_has_purchased'] = 1
y = df_products_purchased_reduced[['customer_id', 'event_type_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
return y
## Export
# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
# Dataset test
X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced)
dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 'left')
# Exportation
FILE_KEY_OUT_S3 = "dataset_test.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
# Dataset train
X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)
y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced)
dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left')
# Exportation
FILE_KEY_OUT_S3 = "dataset_train.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
print("Exportation dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")

View File

@ -1,38 +1,92 @@
# Cleaning and merge functions
#### Cleaning and merge functions ####
# Cleaning function
BUCKET = "bdc2324-data"
# 1. Basic cleaning functions
def cleaning_date(df, column_name):
"""
Cleans the specified DataFrame column by converting its values to datetime using the ISO 8601 format.
Parameters:
- df: DataFrame
The DataFrame containing the column to clean.
- column_name: str
The name of the column to clean.
Returns:
- DataFrame
The modified DataFrame with the cleaned column.
Datetime column cleaning with ISO 8601 format
"""
df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
return df
def preprocessing_customerplus(customerplus = None):
def display_databases(directory_path, file_name):
"""
This function returns the file from s3 storage
"""
file_path = BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
customerplus_copy = customerplus.copy()
print("Shape : ", df.shape)
return df
def remove_horodates(df):
"""
This function removes timestamp ('horodate') columns such as created_at and updated_at
"""
df = df.drop(columns = ["created_at", "updated_at"])
return df
def order_columns_id(df):
"""
This function moves all id columns to the front so the dataset is easier to read
"""
substring = 'id'
id_columns = [col for col in df.columns if substring in col]
remaining_col = [col for col in df.columns if substring not in col]
new_order = id_columns + remaining_col
return df[new_order]
def process_df_2(df):
"""
This function organizes dataframe
"""
df = remove_horodates(df)
print("Number of columns : ", len(df.columns))
df = order_columns_id(df)
print("Columns : ", df.columns)
return df
def load_dataset(directory_path, name):
"""
This function loads csv file
"""
df = display_databases(directory_path, file_name = name)
df = process_df_2(df)
# drop na :
#df = df.dropna(axis=1, thresh=len(df))
# if identifier in table : delete it
if 'identifier' in df.columns:
df = df.drop(columns = 'identifier')
return df
# 2. Creation of cleaned and merged datasets
def preprocessing_customerplus(directory_path):
customerplus_copy = load_dataset(directory_path, name = "customersplus")
# Convert to date format
cleaning_date(customerplus_copy, 'first_buying_date')
cleaning_date(customerplus_copy, 'last_visiting_date')
# Select variables
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy
def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
def preprocessing_tickets_area(directory_path):
# Datasets loading
tickets = load_dataset(directory_path, name = "tickets")
purchases = load_dataset(directory_path, name = "purchases")
suppliers = load_dataset(directory_path, name = "suppliers")
type_ofs = load_dataset(directory_path, name = "type_ofs")
# Tickets table
tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
@ -48,7 +102,7 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
# Purchases table
# Clean the purchase date
cleaning_date(purchases, 'purchase_date')
# cleaning_date(purchases, 'purchase_date')
# Select variables
purchases = purchases[['id', 'purchase_date', 'customer_id']]
@ -67,8 +121,13 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
return ticket_information
def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
# Target.csv cleaning
def preprocessing_target_area(directory_path):
# Datasets loading
targets = load_dataset(directory_path, name = "targets")
target_types = load_dataset(directory_path, name = "target_types")
customer_target_mappings = load_dataset(directory_path, name = "customer_target_mappings")
# target cleaning
targets = targets[["id", "target_type_id", "name"]]
targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)
@ -88,16 +147,21 @@ def preprocessing_target_area(targets = None, target_types = None, customer_targ
return targets_full
def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
def preprocessing_campaigns_area(directory_path):
# Datasets loading
campaign_stats = load_dataset(directory_path, name = "campaign_stats")
campaigns = load_dataset(directory_path, name = "campaigns")
# campaign_stats cleaning
campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
cleaning_date(campaign_stats, 'opened_at')
cleaning_date(campaign_stats, 'sent_at')
cleaning_date(campaign_stats, 'delivered_at')
# cleaning_date(campaign_stats, 'opened_at')
# cleaning_date(campaign_stats, 'sent_at')
# cleaning_date(campaign_stats, 'delivered_at')
# campaigns cleaning
campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
cleaning_date(campaigns, 'campaign_sent_at')
# cleaning_date(campaigns, 'campaign_sent_at')
# Merge
campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")
@ -105,66 +169,11 @@ def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
return campaigns_full
def display_databases(file_name):
"""
This function returns the file from s3 storage
"""
file_path = BUCKET + "/" + directory_path + "/" + file_name
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
print("Shape : ", df.shape)
return df
def remove_horodates(df):
"""
This function removes timestamp ('horodate') columns such as created_at and updated_at
"""
df = df.drop(columns = ["created_at", "updated_at"])
return df
def order_columns_id(df):
"""
This function moves all id columns to the front so the dataset is easier to read
"""
substring = 'id'
id_columns = [col for col in df.columns if substring in col]
remaining_col = [col for col in df.columns if substring not in col]
new_order = id_columns + remaining_col
return df[new_order]
def process_df_2(df):
"""
This function organizes dataframe
"""
df = remove_horodates(df)
print("Number of columns : ", len(df.columns))
df = order_columns_id(df)
print("Columns : ", df.columns)
return df
def load_dataset(name):
"""
This function loads csv file
"""
df = display_databases(name)
df = process_df_2(df)
# drop na :
#df = df.dropna(axis=1, thresh=len(df))
# if identifier in table : delete it
if 'identifier' in df.columns:
df = df.drop(columns = 'identifier')
return df
def create_products_table():
def create_products_table(directory_path):
# first merge products and categories
print("first merge products and categories")
products = load_dataset("1products.csv")
categories = load_dataset("1categories.csv")
products = load_dataset(directory_path, name = "products")
categories = load_dataset(directory_path, name = "categories")
# Drop useless columns
products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
categories = categories.drop(columns = ['extra_field', 'quota'])
@ -176,7 +185,7 @@ def create_products_table():
# Second merge products_theme and type of categories
print("Second merge products_theme and type of categories")
type_of_categories = load_dataset("1type_of_categories.csv")
type_of_categories = load_dataset(directory_path, name = "type_of_categories")
type_of_categories = type_of_categories.drop(columns = 'id')
products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
right_on = 'category_id' )
@ -187,11 +196,11 @@ def create_products_table():
return products_theme
def create_events_table():
def create_events_table(directory_path):
# first merge events and seasons :
print("first merge events and seasons : ")
events = load_dataset("1events.csv")
seasons = load_dataset("1seasons.csv")
events = load_dataset(directory_path, name = "events")
seasons = load_dataset(directory_path, name = "seasons")
# Drop useless columns
events = events.drop(columns = ['manual_added', 'is_display'])
@ -201,7 +210,7 @@ def create_events_table():
# Secondly merge events_theme and event_types
print("Secondly merge events_theme and event_types : ")
event_types = load_dataset("1event_types.csv")
event_types = load_dataset(directory_path, name = "event_types")
event_types = event_types.drop(columns = ['fidelity_delay'])
events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))
@ -210,7 +219,7 @@ def create_events_table():
# thirdly merge events_theme and facilities
print("thirdly merge events_theme and facilities : ")
facilities = load_dataset("1facilities.csv")
facilities = load_dataset(directory_path, name = "facilities")
facilities = facilities.drop(columns = ['fixed_capacity'])
events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))
@ -222,14 +231,13 @@ def create_events_table():
events_theme = order_columns_id(events_theme)
return events_theme
def create_representations_table():
representations = load_dataset("1representations.csv")
def create_representations_table(directory_path):
representations = load_dataset(directory_path, name = "representations")
representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
'representation_type_id'])
representations_capacity = load_dataset("1representation_category_capacities.csv")
representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
representations_theme = representations.merge(representations_capacity, how='left',
@ -240,22 +248,27 @@ def create_representations_table():
representations_theme = order_columns_id(representations_theme)
return representations_theme
def uniform_product_df():
def uniform_product_df(directory_path):
"""
This function returns the uniform product dataset
"""
products_theme = create_products_table(directory_path)
representation_theme = create_representations_table(directory_path)
events_theme = create_events_table(directory_path)
ticket_information = preprocessing_tickets_area(directory_path)
print("Products theme columns : ", products_theme.columns)
print("\n Representation theme columns : ", representation_theme.columns)
print("\n Events theme columns : ", events_theme.columns)
products_global = products_theme.merge(representation_theme, how='left',
products_global = pd.merge(products_theme, representation_theme, how='left',
on= ["representation_id", "category_id"])
products_global = products_global.merge(events_theme, how='left', on='event_id',
products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
suffixes = ("_representation", "_event"))
products_global = order_columns_id(products_global)
products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
# remove useless columns
products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'
return products_global
products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
return products_purchased_reduced
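A side note on the path convention used by display_databases in this file: the company folder number is repeated as a file-name prefix, so directory_path = "1" with file_name = "customersplus" resolves to bdc2324-data/1/1customersplus.csv. A small standalone sketch of that construction (the helper name build_raw_file_path and the example values are illustrative, not part of the committed code):
BUCKET = "bdc2324-data"

def build_raw_file_path(directory_path, file_name):
    # Same convention as display_databases: <bucket>/<company>/<company><file_name>.csv
    return BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"

print(build_raw_file_path("1", "customersplus"))  # bdc2324-data/1/1customersplus.csv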

View File

@ -1,6 +1,20 @@
# KPI construction functions
def custom_date_parser(date_string):
return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
def display_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None):
# Number of email campaigns
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
@ -35,17 +49,17 @@ def tickets_kpi_function(tickets_information = None):
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)
# Share of online sales
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
# Average amount
avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
.agg({"amount" : "mean"}).reset_index()
.rename(columns = {'amount' : 'avg_amount'}))
# avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
# .agg({"amount" : "mean"}).reset_index()
# .rename(columns = {'amount' : 'avg_amount'}))
tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id', 'event_type_id'])
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg({'ticket_id': 'count',
'purchase_id' : 'nunique',
'amount' : 'sum',
@ -61,8 +75,7 @@ def tickets_kpi_function(tickets_information = None):
'purchase_id_nunique' : 'nb_purchases',
'amount_sum' : 'total_amount',
'supplier_name_nunique' : 'nb_suppliers',
'customer_id_' : 'customer_id',
'event_type_id_' : 'event_type_id'}, inplace = True)
'customer_id_' : 'customer_id'}, inplace = True)
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # In days
@ -73,10 +86,10 @@ def tickets_kpi_function(tickets_information = None):
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
# tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
return tickets_kpi
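For orientation, a small self-contained sketch of the per-customer aggregation pattern that tickets_kpi_function now follows after event_type_id was dropped from the group keys (toy data and a simplified agg spec; the real function also derives supplier, purchase-date and online-sale KPIs):
import pandas as pd

tickets_toy = pd.DataFrame({
    'customer_id': [1, 1, 2],
    'ticket_id':   [10, 11, 12],
    'purchase_id': [100, 100, 200],
    'amount':      [25.0, 25.0, 40.0],
})
tickets_kpi_toy = (tickets_toy
    .groupby('customer_id')
    .agg(nb_tickets = ('ticket_id', 'count'),
         nb_purchases = ('purchase_id', 'nunique'),
         total_amount = ('amount', 'sum'))
    .reset_index())
print(tickets_kpi_toy)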

View File

@ -615,19 +615,15 @@
"FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" purchases = pd.read_csv(file_in, sep=\",\")\n",
" purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'])\n",
" \n",
"purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True, format = 'ISO8601')\n",
"\n",
"# Emails\n",
"BUCKET = \"projet-bdc2324-team1\"\n",
"FILE_KEY_S3 = \"0_Temp/Company 1 - Campaigns dataset clean.csv\"\n",
"FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaigns = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"campaigns['sent_at'] = pd.to_datetime(campaigns['sent_at'], utc = True, format = 'ISO8601')\n"
" campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'])\n"
]
},
{
@ -818,7 +814,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 16,
"id": "f663d68b-8a5c-4804-b31a-4477a03ca1e4",
"metadata": {
"scrolled": true
@ -906,7 +902,7 @@
"max 641981.000000 1.256574e+06"
]
},
"execution_count": 33,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@ -917,7 +913,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "d1212b10-3933-450a-b001-9e2cbf308f79",
"metadata": {},
"outputs": [
@ -1219,7 +1215,7 @@
"[1826672 rows x 15 columns]"
]
},
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@ -1238,7 +1234,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"id": "dc45c1cd-2a78-48a6-aa2b-6a501254b6f2",
"metadata": {},
"outputs": [
@ -1458,7 +1454,7 @@
"[5 rows x 40 columns]"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -1478,7 +1474,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"id": "89fcb455-efb4-4ad4-ab88-efd6c8a76287",
"metadata": {},
"outputs": [
@ -1499,7 +1495,7 @@
" dtype='object')"
]
},
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@ -1510,7 +1506,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"id": "d7b2356a-d5fc-4547-b3ff-fded0e304fb6",
"metadata": {},
"outputs": [
@ -1634,7 +1630,7 @@
"9 0.0 "
]
},
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@ -1653,7 +1649,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"id": "5559748f-1745-4651-a9f6-94702c7ee66f",
"metadata": {},
"outputs": [
@ -1813,7 +1809,7 @@
"max 434.000000 "
]
},
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -1835,7 +1831,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"id": "4971e35d-a762-4e18-9443-fd9571bd3f1e",
"metadata": {},
"outputs": [
@ -1864,7 +1860,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"id": "bc65a711-d172-4839-b487-3047280fc3a6",
"metadata": {},
"outputs": [
@ -1894,7 +1890,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"id": "c95cc35c-abfc-47c7-9b8a-ac69bfd60dd8",
"metadata": {},
"outputs": [
@ -1922,7 +1918,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 25,
"id": "49d5fd2d-9bc1-43ac-9270-1efd73759854",
"metadata": {},
"outputs": [
@ -1967,7 +1963,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 26,
"id": "e50e2583-4b8f-478e-87ac-591dde200af8",
"metadata": {},
"outputs": [
@ -1988,7 +1984,7 @@
" dtype='object')"
]
},
"execution_count": 25,
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@ -1999,7 +1995,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 27,
"id": "c724a315-9fe8-4874-be8f-a8115b17b5e2",
"metadata": {},
"outputs": [],
@ -2021,7 +2017,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 28,
"id": "58af5dcb-673e-4f4d-ad5c-f66ce1e8a22c",
"metadata": {},
"outputs": [
@ -2042,7 +2038,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 29,
"id": "cc3437f7-8b36-4398-9da6-ff15e8e4c8d7",
"metadata": {},
"outputs": [

View File

@ -1,695 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
"metadata": {},
"outputs": [],
"source": [
"# Chargement des fichiers campaign_stats.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
"metadata": {},
"outputs": [],
"source": [
"# Conversion des dates 'sent_at'\n",
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
"metadata": {},
"outputs": [],
"source": [
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
"print(campaign_stats_1['sent_at'].max())\n",
"print(campaign_stats_1['sent_at'].min())\n",
"\n",
"print(campaign_stats_2['sent_at'].max())\n",
"print(campaign_stats_2['sent_at'].min())\n",
"\n",
"print(campaign_stats_3['sent_at'].max())\n",
"print(campaign_stats_3['sent_at'].min())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
"metadata": {},
"outputs": [],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
"metadata": {},
"outputs": [],
"source": [
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"# Exemple id commun = caractéristiques communes\n",
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"customers_plus_1.isna().mean()*100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f6ce60d-0912-497d-9108-330acccef394",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Jointure\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
},
{
"cell_type": "markdown",
"id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
"metadata": {},
"source": [
"# Fusion et exploration"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
"metadata": {},
"outputs": [],
"source": [
"# Jointure\n",
"var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
"\n",
"var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
"\n",
"var_choosed.remove('representation_id')\n",
"var_choosed.extend(['start_date_time', 'event_id'])\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
"\n",
"var_choosed.remove('event_id')\n",
"var_choosed.extend(['name', 'customer_id'])\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
"\n",
"# Changement de nom\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"var_choosed[var_choosed.index('name')] = \"event_name\"\n",
"\n",
"# Base finale\n",
"var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
"df_customer_event"
]
},
{
"cell_type": "markdown",
"id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
"metadata": {},
"source": [
"## Type de client au globale"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
"metadata": {},
"outputs": [],
"source": [
"# Client\n",
"print(customer_target_mappings.columns)\n",
"print(customer_target_mappings.shape)\n",
"customer_target_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
"metadata": {},
"outputs": [],
"source": [
"customer_target_mappings['extra_field'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
"metadata": {},
"outputs": [],
"source": [
"customer_target_mappings['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
"metadata": {},
"outputs": [],
"source": [
"# Segmentation existante\n",
"print(target_types.columns)\n",
"print(target_types.shape)\n",
"target_types.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5adb1773-648d-4683-bc08-d1f2298c1283",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"target_types"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
"metadata": {},
"outputs": [],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tags = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tags.columns)\n",
"print(tags.shape)\n",
"tags.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
"metadata": {},
"outputs": [],
"source": [
"tags"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
"metadata": {},
"outputs": [],
"source": [
"# Structure = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(structure_tag_mappings.columns)\n",
"print(structure_tag_mappings.shape)\n",
"structure_tag_mappings.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74dc34ad-375b-48df-a900-40d92c5fff13",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
"metadata": {},
"outputs": [],
"source": [
"# Tags = clients\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(customersplus.columns)\n",
"print(customersplus.shape)\n",
"customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
"metadata": {},
"outputs": [],
"source": [
"customersplus"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
"metadata": {},
"outputs": [],
"source": [
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
"metadata": {},
"outputs": [],
"source": [
"# tickets\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" tickets = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(tickets.columns)\n",
"print(tickets.shape)\n",
"tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
"metadata": {},
"outputs": [],
"source": [
"tickets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
"metadata": {},
"outputs": [],
"source": [
"tickets['type_of'].unique()"
]
},
{
"cell_type": "markdown",
"id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
"metadata": {},
"source": [
"## Types d'évenement et client"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
"metadata": {},
"outputs": [],
"source": [
"# Evenement = events.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" events = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(events.columns)\n",
"print(events.shape)\n",
"events.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
"metadata": {},
"outputs": [],
"source": [
"events"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af80eee8-f717-4159-a0fd-09d47ec96621",
"metadata": {},
"outputs": [],
"source": [
"events['name'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
"metadata": {},
"outputs": [],
"source": [
"# Représentation des évenements = representations.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" representations = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(representations.columns)\n",
"print(representations.shape)\n",
"representations.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
"metadata": {},
"outputs": [],
"source": [
"representations"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
"metadata": {},
"outputs": [],
"source": [
"# Produits vendues = products.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" products = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(products.columns)\n",
"print(products.shape)\n",
"products.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
"metadata": {},
"outputs": [],
"source": [
"products"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
"metadata": {},
"outputs": [],
"source": [
"# Lieu = facilities.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" facilities = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(facilities.columns)\n",
"print(facilities.shape)\n",
"facilities.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3642483-2879-442a-ad69-efcd2331a200",
"metadata": {},
"outputs": [],
"source": [
"facilities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
"metadata": {},
"outputs": [],
"source": [
"# Saisons = seasons.csv période sur deux années consécutives\n",
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" seasons = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(seasons.columns)\n",
"print(seasons.shape)\n",
"seasons.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
"metadata": {},
"outputs": [],
"source": [
"seasons['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
"metadata": {},
"outputs": [],
"source": [
"# Achats = purchases.csv \n",
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" purchases = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"print(purchases.columns)\n",
"print(purchases.shape)\n",
"purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
"metadata": {},
"outputs": [],
"source": [
"purchases"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -124,9 +124,7 @@
{
"cell_type": "markdown",
"id": "e855f403",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"metadata": {},
"source": [
"## customersplus.csv"
]
@ -1289,7 +1287,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
"version": "3.11.6"
}
},
"nbformat": 4,