Merge branch 'data_construction'
This commit is contained in: commit 29eafcc6b2

0_1_Input_cleaning.py (new file, 58 lines)
@@ -0,0 +1,58 @@
# Business Data Challenge - Team 1

import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Import cleaning and merge functions
exec(open('0_Cleaning_and_merge_functions.py').read())

# Output folder
BUCKET_OUT = "projet-bdc2324-team1"

# Ignore warnings
warnings.filterwarnings('ignore')


def export_dataset(df, output_name):
    print('Exportation of dataset :', output_name)
    FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        df.to_csv(file_out, index = False)

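Both the read and the write side of this script go through the same s3fs object; a minimal sketch of the two patterns, for illustration only (the raw-file path follows the "bdc2324-data/<id>/<id><table>.csv" layout that appears elsewhere in this commit, and the sketch reuses the fs and BUCKET_OUT objects defined above):

# Sketch, not part of the committed file: read one raw table, then write a cleaned one back
with fs.open("bdc2324-data/1/1customersplus.csv", mode="rb") as file_in:
    customersplus = pd.read_csv(file_in, sep=",")
with fs.open(BUCKET_OUT + "/0_Input/Company_1/customerplus_cleaned.csv", 'w') as file_out:
    customersplus.to_csv(file_out, index=False)
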
## 1 - Cleaning of the datasets
for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
    # Cleaning customerplus
    df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)

    ## Export
    export_dataset(df = df1_customerplus_clean, output_name = "0_Input/Company_"+ tenant_id +"/customerplus_cleaned.csv")

    # Cleaning target area
    df1_target_information = preprocessing_target_area(directory_path = tenant_id)
    ## Export
    export_dataset(df = df1_target_information, output_name = "0_Input/Company_"+ tenant_id +"/target_information.csv")

    # Cleaning campaign area
    df1_campaigns_information = preprocessing_campaigns_area(directory_path = tenant_id)
    ## Export
    export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")

    ## Export
    # export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")

    # Cleaning product area
    df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
    ## Export
    export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
    # Export
    # export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")

    print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")

0_2_Dataset_construction.py (new file, 128 lines)
@@ -0,0 +1,128 @@
# Business Data Challenge - Team 1

import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})


# Import cleaning and merge functions
exec(open('0_KPI_functions.py').read())

# Ignore warnings
warnings.filterwarnings('ignore')


def dataset_construction(min_date, end_features_date, max_date, directory_path):

    # Import customerplus
    df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
    df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
    df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])

    # Consistency filter before applying our method
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

    # Filter df_campaigns_information
    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')

    # Filter df_products_purchased_reduced
    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]

    print("Data filtering : SUCCESS")
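    # Sketch, not part of the committed file: the chained indexing used on 'opened_at' above
    # can trigger pandas' SettingWithCopyWarning (one reason the warnings filter is on);
    # an equivalent warning-free form would be:
    # df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = np.datetime64('NaT')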

    # Merge everything and build the KPIs

    # KPIs on advertising campaigns
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)

    # KPIs on purchasing behaviour
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)

    # KPIs on socio-demographic data

    ## Gender
    df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
        0: 'female',
        1: 'male',
        2: 'other'
    })
    gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
    df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
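    # Sketch, not part of the committed file: for gender values 0, 1, 2 the two lines above
    # add the integer dummy columns gender_female, gender_male, gender_other; e.g. a customer
    # with gender == 1 gets gender_female = 0, gender_male = 1, gender_other = 0.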

    ## Indicator: does the customer live in France
    df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)

    print("KPIs construction : SUCCESS")

    # Merge with customer-related KPIs
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)

    # Merge with purchasing-behaviour KPIs
    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')

    # Fill NaN values
    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)

    print("Explanatory variable construction : SUCCESS")

    # 2. Construction of the explained variable
    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]

    # Purchase indicator
    df_products_purchased_to_predict['y_has_purchased'] = 1

    y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()

    print("Explained variable construction : SUCCESS")

    # 3. Merge between explained and explanatory variables
    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')

    # 0 if there is no purchase (assign the result back, otherwise fillna is a no-op)
    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)

    return dataset

## Export

# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"

# Dataset test
dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")

# # Export
# FILE_KEY_OUT_S3 = "dataset_test.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
#     dataset_test.to_csv(file_out, index = False)

# print("Exportation dataset test : SUCCESS")

# Dataset train
dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")

# Export
# FILE_KEY_OUT_S3 = "dataset_train.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
#     dataset_train.to_csv(file_out, index = False)

# print("Exportation dataset train : SUCCESS")


print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
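The two dataset_construction calls above implement a rolling temporal split; summarised as a sketch, with the dates taken directly from the calls:

# train: features on [2021-05-01, 2023-05-01], y_has_purchased measured on (2023-05-01, 2023-08-01]
# test : features on [2021-08-01, 2023-08-01], y_has_purchased measured on (2023-08-01, 2023-11-01]
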
@@ -1,193 +0,0 @@
# Business Data Challenge - Team 1

import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Import cleaning and merge functions
exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
exec(open('BDC-team-1/0_KPI_functions.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Ignore warning
warnings.filterwarnings('ignore')

# Data loading
BUCKET = "bdc2324-data/1"
liste_database = fs.ls(BUCKET)

# loop to create dataframes from liste
client_number = liste_database[0].split("/")[1]
df_prefix = "df" + str(client_number) + "_"

for i in range(len(liste_database)) :
    current_path = liste_database[i]
    with fs.open(current_path, mode="rb") as file_in:
        df = pd.read_csv(file_in)
    # the pattern of the name is df1xxx
    nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
    globals()[nom_dataframe] = df

## 1 - Cleaning of the datasets

# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)

# Cleaning target area
df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)

# Cleaning campaign area
df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)

# Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Campaigns dataset clean.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    df1_campaigns_information.to_csv(file_out, index = False)

## Cleaning product area

# Cleaning ticket area
df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)


BUCKET = "bdc2324-data"
directory_path = '1'

products_theme = create_products_table()
events_theme = create_events_table()
representation_theme = create_representations_table()
products_global = uniform_product_df()

# Fusion liée au product
df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')

# Selection des variables d'intérêts
df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]

# Exportation
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchases.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    df1_products_purchased_reduced.to_csv(file_out, index = False)

## 2 - Construction of KPIs on a given period

def explanatory_variables(min_date, max_date, df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):

    # Filtre de cohérence pour la mise en pratique de notre méthode
    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')

    # Filtre de la base df_campaigns_information
    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)]
    df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= max_date] = np.datetime64('NaT')

    # Filtre de la base df_products_purchased_reduced
    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]

    print("Data filtering : SUCCESS")

    # Fusion de l'ensemble et creation des KPI

    # KPI sur les campagnes publicitaires
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)

    # KPI sur le comportement d'achat
    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)

    # KPI sur les données socio-demographique

    ## Le genre
    df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
        0: 'female',
        1: 'male',
        2: 'other'
    })
    gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
    df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)

    ## Indicatrice si individue vit en France
    df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)

    print("KPIs construction : SUCCESS")

    # Fusion avec KPI liés au customer
    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')

    # Fill NaN values
    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)

    # Fusion avec KPI liés au comportement d'achat
    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')

    # Fill NaN values
    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)

    print("Explanatory variable construction : SUCCESS")

    return df_customer_product

# Fonction pour créer les variables expliquée
def explained_variable(min_date, max_date, df_products_purchased_reduced = df1_products_purchased_reduced):

    # Filtrer la base d'achat
    df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > min_date)]

    # Indicatrice d'achat
    df_products_purchased_reduced['y_has_purchased'] = 1

    y = df_products_purchased_reduced[['customer_id', 'event_type_id', 'y_has_purchased']].drop_duplicates()

    print("Explained variable construction : SUCCESS")

    return y

## Exportation

# Dossier d'exportation
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"

# Dataset test
X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)

y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced)

dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 'left')

# Exportation
FILE_KEY_OUT_S3 = "dataset_test.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    dataset_test.to_csv(file_out, index = False)

print("Exportation dataset test : SUCCESS")

# Dataset train
X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean)

y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced)

dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left')

# Exportation
FILE_KEY_OUT_S3 = "dataset_train.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    dataset_train.to_csv(file_out, index = False)

print("Exportation dataset train : SUCCESS")


print("FIN DE LA GENERATION DES DATASETS : SUCCESS")
@@ -1,38 +1,92 @@
-# Cleaning and merge functions
+#### Cleaning and merge functions ####

-# Cleaning function
+BUCKET = "bdc2324-data"
+
+# 1. Basic cleaning functions
 def cleaning_date(df, column_name):
     """
-    Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.
-
-    Parameters:
-    - df: DataFrame
-        Le DataFrame contenant la colonne à nettoyer.
-    - column_name: str
-        Le nom de la colonne à nettoyer.
-
-    Returns:
-    - DataFrame
-        Le DataFrame modifié avec la colonne nettoyée.
+    Datetime columns cleaning with ISO format
     """
     df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
     return df

-def preprocessing_customerplus(customerplus = None):
+def display_databases(directory_path, file_name):
+    """
+    This function returns the file from s3 storage
+    """
+    file_path = BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",")
+
+    print("Shape : ", df.shape)
+    return df
+
-    customerplus_copy = customerplus.copy()
+def remove_horodates(df):
+    """
+    this function remove horodate columns like created_at and updated_at
+    """
+    df = df.drop(columns = ["created_at", "updated_at"])
+    return df
+
+def order_columns_id(df):
+    """
+    this function puts all id columns at the beginning in order to read the dataset easier
+    """
+    substring = 'id'
+    id_columns = [col for col in df.columns if substring in col]
+    remaining_col = [col for col in df.columns if substring not in col]
+    new_order = id_columns + remaining_col
+    return df[new_order]
+
+def process_df_2(df):
+    """
+    This function organizes dataframe
+    """
+    df = remove_horodates(df)
+    print("Number of columns : ", len(df.columns))
+    df = order_columns_id(df)
+    print("Columns : ", df.columns)
+    return df
+
+def load_dataset(directory_path, name):
+    """
+    This function loads csv file
+    """
+    df = display_databases(directory_path, file_name = name)
+    df = process_df_2(df)
+    # drop na :
+    #df = df.dropna(axis=1, thresh=len(df))
+    # if identifier in table : delete it
+    if 'identifier' in df.columns:
+        df = df.drop(columns = 'identifier')
+    return df
+
+
+# 2. Creation of cleaned and merged datasets
+
+def preprocessing_customerplus(directory_path):
+
+    customerplus_copy = load_dataset(directory_path, name = "customersplus")

     # Passage en format date
     cleaning_date(customerplus_copy, 'first_buying_date')
     cleaning_date(customerplus_copy, 'last_visiting_date')

     # Selection des variables
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
     customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)

     return customerplus_copy

-def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    type_ofs = load_dataset(directory_path, name = "type_ofs")

     # Base des tickets
     tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
     tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
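The display_databases helper added above builds the S3 key from the company folder and the table name; a small illustration of the resulting path, with values taken from elsewhere in this commit and shown only as an example:

# display_databases("1", file_name = "customersplus")
# -> file_path == "bdc2324-data/1/1customersplus.csv"
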
@@ -48,7 +102,7 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non

     # Base des achats
     # Nettoyage de la date d'achat
-    cleaning_date(purchases, 'purchase_date')
+    # cleaning_date(purchases, 'purchase_date')
     # Selection des variables
     purchases = purchases[['id', 'purchase_date', 'customer_id']]

@@ -67,8 +121,13 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non

     return ticket_information

-def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
-    # Target.csv cleaning
+def preprocessing_target_area(directory_path):
+
+    # Datasets loading
+    targets = load_dataset(directory_path, name = "targets")
+    target_types = load_dataset(directory_path, name = "target_types")
+    customer_target_mappings = load_dataset(directory_path, name = "customer_target_mappings")
+    # target cleaning
     targets = targets[["id", "target_type_id", "name"]]
     targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)

@@ -88,16 +147,21 @@ def preprocessing_target_area(targets = None, target_types = None, customer_targ

     return targets_full

-def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
+def preprocessing_campaigns_area(directory_path):
+
+    # Datasets loading
+    campaign_stats = load_dataset(directory_path, name = "campaign_stats")
+    campaigns = load_dataset(directory_path, name = "campaigns")
+
     # campaign_stats cleaning
     campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
-    cleaning_date(campaign_stats, 'opened_at')
-    cleaning_date(campaign_stats, 'sent_at')
-    cleaning_date(campaign_stats, 'delivered_at')
+    # cleaning_date(campaign_stats, 'opened_at')
+    # cleaning_date(campaign_stats, 'sent_at')
+    # cleaning_date(campaign_stats, 'delivered_at')

     # campaigns cleaning
     campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
-    cleaning_date(campaigns, 'campaign_sent_at')
+    # cleaning_date(campaigns, 'campaign_sent_at')

     # Merge
     campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")

@@ -105,66 +169,11 @@ def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):

     return campaigns_full

-def display_databases(file_name):
-    """
-    This function returns the file from s3 storage
-    """
-    file_path = BUCKET + "/" + directory_path + "/" + file_name
-    print("File path : ", file_path)
-    with fs.open(file_path, mode="rb") as file_in:
-        df = pd.read_csv(file_in, sep=",")
-
-    print("Shape : ", df.shape)
-    return df
-
-
-def remove_horodates(df):
-    """
-    this function remove horodate columns like created_at and updated_at
-    """
-    df = df.drop(columns = ["created_at", "updated_at"])
-    return df
-
-
-def order_columns_id(df):
-    """
-    this function puts all id columns at the beginning in order to read the dataset easier
-    """
-    substring = 'id'
-    id_columns = [col for col in df.columns if substring in col]
-    remaining_col = [col for col in df.columns if substring not in col]
-    new_order = id_columns + remaining_col
-    return df[new_order]
-
-
-def process_df_2(df):
-    """
-    This function organizes dataframe
-    """
-    df = remove_horodates(df)
-    print("Number of columns : ", len(df.columns))
-    df = order_columns_id(df)
-    print("Columns : ", df.columns)
-    return df
-
-def load_dataset(name):
-    """
-    This function loads csv file
-    """
-    df = display_databases(name)
-    df = process_df_2(df)
-    # drop na :
-    #df = df.dropna(axis=1, thresh=len(df))
-    # if identifier in table : delete it
-    if 'identifier' in df.columns:
-        df = df.drop(columns = 'identifier')
-    return df
-
-def create_products_table():
+def create_products_table(directory_path):
     # first merge products and categories
     print("first merge products and categories")
-    products = load_dataset("1products.csv")
-    categories = load_dataset("1categories.csv")
+    products = load_dataset(directory_path, name = "products")
+    categories = load_dataset(directory_path, name = "categories")
     # Drop useless columns
     products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
     categories = categories.drop(columns = ['extra_field', 'quota'])

@@ -176,7 +185,7 @@ def create_products_table():

     # Second merge products_theme and type of categories
     print("Second merge products_theme and type of categories")
-    type_of_categories = load_dataset("1type_of_categories.csv")
+    type_of_categories = load_dataset(directory_path, name = "type_of_categories")
     type_of_categories = type_of_categories.drop(columns = 'id')
     products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
                                           right_on = 'category_id' )

@@ -187,11 +196,11 @@ def create_products_table():

     return products_theme


-def create_events_table():
+def create_events_table(directory_path):
     # first merge events and seasons :
     print("first merge events and seasons : ")
-    events = load_dataset("1events.csv")
-    seasons = load_dataset("1seasons.csv")
+    events = load_dataset(directory_path, name = "events")
+    seasons = load_dataset(directory_path, name = "seasons")

     # Drop useless columns
     events = events.drop(columns = ['manual_added', 'is_display'])

@@ -201,7 +210,7 @@ def create_events_table():

     # Secondly merge events_theme and event_types
     print("Secondly merge events_theme and event_types : ")
-    event_types = load_dataset("1event_types.csv")
+    event_types = load_dataset(directory_path, name = "event_types")
     event_types = event_types.drop(columns = ['fidelity_delay'])

     events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))

@@ -210,7 +219,7 @@ def create_events_table():

     # thirdly merge events_theme and facilities
     print("thirdly merge events_theme and facilities : ")
-    facilities = load_dataset("1facilities.csv")
+    facilities = load_dataset(directory_path, name = "facilities")
     facilities = facilities.drop(columns = ['fixed_capacity'])

     events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))

@@ -222,14 +231,13 @@ def create_events_table():

     events_theme = order_columns_id(events_theme)
     return events_theme

-def create_representations_table():
-    representations = load_dataset("1representations.csv")
+def create_representations_table(directory_path):
+    representations = load_dataset(directory_path, name = "representations")
     representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
                                                       'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
                                                       'representation_type_id'])

-    representations_capacity = load_dataset("1representation_category_capacities.csv")
+    representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
     representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])

     representations_theme = representations.merge(representations_capacity, how='left',

@@ -240,22 +248,27 @@ def create_representations_table():

     representations_theme = order_columns_id(representations_theme)
     return representations_theme

-def uniform_product_df():
+def uniform_product_df(directory_path):
     """
     This function returns the uniform product dataset
     """
+    products_theme = create_products_table(directory_path)
+    representation_theme = create_representations_table(directory_path)
+    events_theme = create_events_table(directory_path)
+    ticket_information = preprocessing_tickets_area(directory_path)
+
     print("Products theme columns : ", products_theme.columns)
     print("\n Representation theme columns : ", representation_theme.columns)
     print("\n Events theme columns : ", events_theme.columns)

-    products_global = products_theme.merge(representation_theme, how='left',
+    products_global = pd.merge(products_theme, representation_theme, how='left',
                                            on= ["representation_id", "category_id"])

-    products_global = products_global.merge(events_theme, how='left', on='event_id',
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
                                             suffixes = ("_representation", "_event"))

-    products_global = order_columns_id(products_global)
-
-    # remove useless columns
-    products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'
-    return products_global
+    products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
+
+    return products_purchased_reduced
@@ -1,6 +1,20 @@
 # Function de construction de KPI

+def custom_date_parser(date_string):
+    return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
+
+def display_databases(directory_path, file_name, datetime_col = None):
+    """
+    This function returns the file from s3 storage
+    """
+    file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
+    return df
+
 def campaigns_kpi_function(campaigns_information = None):

     # Nombre de campagnes de mails
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
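The version above pushes datetime parsing into read_csv: parse_dates plus custom_date_parser applies the same conversion that was previously done by hand after loading (the notebook hunk further down shows that manual line being removed). As a sketch of the equivalence on one column, not part of the commit:

# df = pd.read_csv(file_in, sep=",")
# df['purchase_date'] = pd.to_datetime(df['purchase_date'], utc=True, format='ISO8601')

Note that the date_parser argument is deprecated in recent pandas releases, so the hand-written conversion after the read remains the safer long-term form.
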
@@ -29,23 +43,23 @@ def campaigns_kpi_function(campaigns_information = None):

 def tickets_kpi_function(tickets_information = None):

     tickets_information_copy = tickets_information.copy()

     # Dummy : Canal de vente en ligne
     liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance
     tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)

     # Proportion de vente en ligne
-    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()
+    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
     prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)

     # Average amount
-    avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
-                  .agg({"amount" : "mean"}).reset_index()
-                  .rename(columns = {'amount' : 'avg_amount'}))
+    # avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
+    #               .agg({"amount" : "mean"}).reset_index()
+    #               .rename(columns = {'amount' : 'avg_amount'}))


-    tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
-                   .groupby(['customer_id', 'event_type_id'])
+    tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
+                   .groupby(['customer_id'])
                    .agg({'ticket_id': 'count',
                          'purchase_id' : 'nunique',
                          'amount' : 'sum',

@@ -61,8 +75,7 @@ def tickets_kpi_function(tickets_information = None):

                           'purchase_id_nunique' : 'nb_purchases',
                           'amount_sum' : 'total_amount',
                           'supplier_name_nunique' : 'nb_suppliers',
-                          'customer_id_' : 'customer_id',
-                          'event_type_id_' : 'event_type_id'}, inplace = True)
+                          'customer_id_' : 'customer_id'}, inplace = True)

     tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
     tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # En nombre de jours

@@ -73,10 +86,10 @@ def tickets_kpi_function(tickets_information = None):

     tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')


-    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')
+    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
     tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)

-    tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
+    # tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')

     return tickets_kpi

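The vente_internet dummy in the hunk above is a plain regex alternation over supplier names; a tiny illustration with invented supplier labels, not data from the project:

# Sketch only: '|'.join(liste_mots) builds the pattern 'en ligne|internet|web|net|vad|online'
import pandas as pd
s = pd.Series(['Guichet', 'Vente en ligne', 'VAD telephone', 'Billetterie web'])
print(s.str.contains('en ligne|internet|web|net|vad|online', case=False).astype(int).tolist())
# -> [0, 1, 1, 1]
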
@@ -615,19 +615,15 @@
     "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
     "\n",
     "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    purchases = pd.read_csv(file_in, sep=\",\")\n",
+    "    purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'])\n",
     "    \n",
-    "purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True, format = 'ISO8601')\n",
-    "\n",
     "# Emails\n",
     "BUCKET = \"projet-bdc2324-team1\"\n",
     "FILE_KEY_S3 = \"0_Temp/Company 1 - Campaigns dataset clean.csv\"\n",
     "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
     "\n",
     "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    campaigns = pd.read_csv(file_in, sep=\",\")\n",
-    "\n",
-    "campaigns['sent_at'] = pd.to_datetime(campaigns['sent_at'], utc = True, format = 'ISO8601')\n"
+    "    campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'])\n"
    ]
   },
   {
@@ -818,7 +814,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 16,
    "id": "f663d68b-8a5c-4804-b31a-4477a03ca1e4",
    "metadata": {
     "scrolled": true
@@ -906,7 +902,7 @@
       "max 641981.000000 1.256574e+06"
      ]
     },
-    "execution_count": 33,
+    "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -917,7 +913,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "d1212b10-3933-450a-b001-9e2cbf308f79",
    "metadata": {},
    "outputs": [
@@ -1219,7 +1215,7 @@
       "[1826672 rows x 15 columns]"
      ]
     },
-    "execution_count": 16,
+    "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1238,7 +1234,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "dc45c1cd-2a78-48a6-aa2b-6a501254b6f2",
    "metadata": {},
    "outputs": [
@@ -1458,7 +1454,7 @@
       "[5 rows x 40 columns]"
      ]
     },
-    "execution_count": 17,
+    "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1478,7 +1474,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "89fcb455-efb4-4ad4-ab88-efd6c8a76287",
    "metadata": {},
    "outputs": [
@@ -1499,7 +1495,7 @@
       " dtype='object')"
      ]
     },
-    "execution_count": 18,
+    "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1510,7 +1506,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "id": "d7b2356a-d5fc-4547-b3ff-fded0e304fb6",
    "metadata": {},
    "outputs": [
@@ -1634,7 +1630,7 @@
       "9 0.0 "
      ]
     },
-    "execution_count": 19,
+    "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1653,7 +1649,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "id": "5559748f-1745-4651-a9f6-94702c7ee66f",
    "metadata": {},
    "outputs": [
@@ -1813,7 +1809,7 @@
       "max 434.000000 "
      ]
     },
-    "execution_count": 20,
+    "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1835,7 +1831,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "4971e35d-a762-4e18-9443-fd9571bd3f1e",
    "metadata": {},
    "outputs": [
@@ -1864,7 +1860,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "bc65a711-d172-4839-b487-3047280fc3a6",
    "metadata": {},
    "outputs": [
@@ -1894,7 +1890,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "c95cc35c-abfc-47c7-9b8a-ac69bfd60dd8",
    "metadata": {},
    "outputs": [
@@ -1922,7 +1918,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "id": "49d5fd2d-9bc1-43ac-9270-1efd73759854",
    "metadata": {},
    "outputs": [
@@ -1967,7 +1963,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "id": "e50e2583-4b8f-478e-87ac-591dde200af8",
    "metadata": {},
    "outputs": [
@@ -1988,7 +1984,7 @@
       " dtype='object')"
      ]
     },
-    "execution_count": 25,
+    "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -1999,7 +1995,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 27,
    "id": "c724a315-9fe8-4874-be8f-a8115b17b5e2",
    "metadata": {},
    "outputs": [],
@@ -2021,7 +2017,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 28,
    "id": "58af5dcb-673e-4f4d-ad5c-f66ce1e8a22c",
    "metadata": {},
    "outputs": [
@@ -2042,7 +2038,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 29,
    "id": "cc3437f7-8b36-4398-9da6-ff15e8e4c8d7",
    "metadata": {},
    "outputs": [
@ -1,695 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Business Data Challenge - Team 1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Configuration de l'accès aux données"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"# Create filesystem object\n",
|
|
||||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
||||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
|
|
||||||
"\n",
|
|
||||||
"BUCKET = \"bdc2324-data\"\n",
|
|
||||||
"fs.ls(BUCKET)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement des fichiers campaign_stats.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Conversion des dates 'sent_at'\n",
|
|
||||||
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].min())"
|
|
||||||
]
|
|
||||||
},
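The six prints above can be condensed into one small summary table — a sketch, assuming the campaign_stats_* frames from the previous cells:

import pandas as pd

# One row per company folder, with the min/max 'sent_at' of its campaign_stats file
date_ranges = pd.DataFrame({
    name: {'min_sent_at': df['sent_at'].min(), 'max_sent_at': df['sent_at'].max()}
    for name, df in {'company_1': campaign_stats_1,
                     'company_2': campaign_stats_2,
                     'company_3': campaign_stats_3}.items()
}).T
print(date_ranges)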
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"campaign_stats_1['sent_at']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Customersplus.csv"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.columns"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.shape"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_2['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
|
|
||||||
"# Exemple id commun = caractéristiques communes\n",
|
|
||||||
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
|
|
||||||
"\n",
|
|
||||||
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
|
|
||||||
]
|
|
||||||
},
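Printing a single shared id only eyeballs the overlap. A sketch like the following (assuming customers_plus_1, customers_plus_2 and common_id from the cells above, and that ids are unique within each table) checks column by column whether shared ids really carry the same characteristics:

# Align the overlapping rows on their id and compare the columns both tables share
shared_1 = customers_plus_1[customers_plus_1['id'].isin(common_id)].set_index('id').sort_index()
shared_2 = customers_plus_2[customers_plus_2['id'].isin(common_id)].set_index('id').sort_index()
common_cols = shared_1.columns.intersection(shared_2.columns)
# True for columns that agree on every shared id (NaN vs NaN counts as a mismatch here)
print(shared_1[common_cols].eq(shared_2[common_cols]).all())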
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.isna().mean()*100"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "6f6ce60d-0912-497d-9108-330acccef394",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement de toutes les données\n",
|
|
||||||
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
|
|
||||||
"\n",
|
|
||||||
"for nom_base in liste_base:\n",
|
|
||||||
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
|
|
||||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
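Writing into globals() works, but a plain dict keeps the loaded tables easier to iterate over and to pass around. A possible variant — a sketch, assuming fs from the setup cell:

import pandas as pd

liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags',
              'events', 'tickets', 'representations', 'purchases', 'products']
tables = {}
for nom_base in liste_base:
    # Same file layout as above: bdc2324-data/11/11<table>.csv
    with fs.open('bdc2324-data/11/11' + nom_base + '.csv', mode="rb") as file_in:
        tables[nom_base] = pd.read_csv(file_in, sep=",")
# e.g. tables['purchases'].head()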
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Jointure\n",
|
|
||||||
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
|
|
||||||
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
|
|
||||||
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
|
|
||||||
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
|
|
||||||
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
|
||||||
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
|
|
||||||
"df_customer_event"
|
|
||||||
]
|
|
||||||
},
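Every step above is an inner join, so rows can silently disappear or multiply. A quick cardinality check (a sketch, assuming the frames built in the cell above) makes that visible:

# Row count after each merge, to spot silent row loss or duplication
for name, df in [('merge_1', merge_1), ('merge_2', merge_2), ('merge_3', merge_3),
                 ('merge_4', merge_4), ('df_customer_event', df_customer_event)]:
    print(f"{name:>20}: {len(df):>9} rows")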
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Fusion et exploration"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Jointure\n",
|
|
||||||
"var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
|
|
||||||
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
|
|
||||||
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"var_choosed.remove('representation_id')\n",
|
|
||||||
"var_choosed.extend(['start_date_time', 'event_id'])\n",
|
|
||||||
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"var_choosed.remove('event_id')\n",
|
|
||||||
"var_choosed.extend(['name', 'customer_id'])\n",
|
|
||||||
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
|
|
||||||
"\n",
|
|
||||||
"# Changement de nom\n",
|
|
||||||
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
|
||||||
"var_choosed[var_choosed.index('name')] = \"event_name\"\n",
|
|
||||||
"\n",
|
|
||||||
"# Base finale\n",
|
|
||||||
"var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
|
|
||||||
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
|
|
||||||
"df_customer_event"
|
|
||||||
]
|
|
||||||
},
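One caveat in the bookkeeping above: var_choosed already contains 'customer_id', so extend(['name', 'customer_id']) adds it a second time and merge_4[var_choosed] ends up with a duplicated column. A guarded append avoids that — a minimal sketch:

# Only add columns that are not already tracked in var_choosed
for col in ['name', 'customer_id']:
    if col not in var_choosed:
        var_choosed.append(col)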
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Type de client au globale"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Client\n",
|
|
||||||
"print(customer_target_mappings.columns)\n",
|
|
||||||
"print(customer_target_mappings.shape)\n",
|
|
||||||
"customer_target_mappings.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customer_target_mappings['extra_field'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customer_target_mappings['name'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Segmentation existante\n",
|
|
||||||
"print(target_types.columns)\n",
|
|
||||||
"print(target_types.shape)\n",
|
|
||||||
"target_types.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "5adb1773-648d-4683-bc08-d1f2298c1283",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"target_types"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Tags = clients\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" tags = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(tags.columns)\n",
|
|
||||||
"print(tags.shape)\n",
|
|
||||||
"tags.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"tags"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Structure = clients\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(structure_tag_mappings.columns)\n",
|
|
||||||
"print(structure_tag_mappings.shape)\n",
|
|
||||||
"structure_tag_mappings.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "74dc34ad-375b-48df-a900-40d92c5fff13",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"structure_tag_mappings"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Tags = clients\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(customersplus.columns)\n",
|
|
||||||
"print(customersplus.shape)\n",
|
|
||||||
"customersplus.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customersplus"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
|
|
||||||
]
|
|
||||||
},
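A sketch of that goal, building on merge_4 and customersplus from the cells above (the column names follow the ones used elsewhere in this notebook, so this is an illustration rather than a finished feature table): aggregate each customer's purchase behaviour and attach it to the socio-demographic columns.

# Per-customer purchase behaviour from the joined table, then joined back to customersplus
behaviour = (merge_4
             .groupby('customer_id')
             .agg(nb_tickets=('id_x', 'count'),
                  total_amount=('amount', 'sum'),
                  first_purchase=('purchase_date', 'min'),
                  last_purchase=('purchase_date', 'max')))
profile = customersplus.merge(behaviour, left_on='id', right_index=True, how='left')
profile[['id', 'age', 'gender', 'country', 'fidelity', 'profession',
         'nb_tickets', 'total_amount']].head()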
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# tickets\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" tickets = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(tickets.columns)\n",
|
|
||||||
"print(tickets.shape)\n",
|
|
||||||
"tickets.info()"
|
|
||||||
]
|
|
||||||
},
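Before relying on the inner joins further down, it is worth checking how many tickets actually reference a purchase. A sketch, assuming tickets and purchases are loaded as in the cells above:

# Share of tickets whose purchase_id exists in purchases['id']
n_linked = tickets['purchase_id'].isin(purchases['id']).sum()
print(f"{n_linked} of {len(tickets)} tickets reference an existing purchase "
      f"({n_linked / len(tickets):.1%})")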
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"tickets"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"tickets['type_of'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Types d'évenement et client"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Evenement = events.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" events = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(events.columns)\n",
|
|
||||||
"print(events.shape)\n",
|
|
||||||
"events.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"events"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "af80eee8-f717-4159-a0fd-09d47ec96621",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"events['name'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Représentation des évenements = representations.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" representations = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(representations.columns)\n",
|
|
||||||
"print(representations.shape)\n",
|
|
||||||
"representations.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"representations"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Produits vendues = products.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" products = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(products.columns)\n",
|
|
||||||
"print(products.shape)\n",
|
|
||||||
"products.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"products"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Lieu = facilities.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" facilities = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(facilities.columns)\n",
|
|
||||||
"print(facilities.shape)\n",
|
|
||||||
"facilities.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "b3642483-2879-442a-ad69-efcd2331a200",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"facilities"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Saisons = seasons.csv période sur deux années consécutives\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" seasons = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(seasons.columns)\n",
|
|
||||||
"print(seasons.shape)\n",
|
|
||||||
"seasons.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"seasons['name'].unique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Achats = purchases.csv \n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" purchases = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"print(purchases.columns)\n",
|
|
||||||
"print(purchases.shape)\n",
|
|
||||||
"purchases.info()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"purchases"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
|
@ -124,9 +124,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "e855f403",
|
"id": "e855f403",
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"jp-MarkdownHeadingCollapsed": true
|
|
||||||
},
|
|
||||||
"source": [
|
"source": [
|
||||||
"## customersplus.csv"
|
"## customersplus.csv"
|
||||||
]
|
]
|
||||||
|
@ -1289,7 +1287,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.13"
|
"version": "3.11.6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user