From 5a13fc3c316a36ee00b844a14b65b44a42cd3c0e Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Mon, 19 Feb 2024 22:11:28 +0000
Subject: [PATCH] New structure

---
 0_Cleaning_and_merge.py           |  71 ---
 0_Cleaning_and_merge_functions.py | 213 ++++-----
 1_Descriptive_Statistics.ipynb    |  50 +--
 Brouillon_AJ.ipynb                | 695 ------------------------------
 Exploration_billet_AJ.ipynb       |   6 +-
 5 files changed, 138 insertions(+), 897 deletions(-)
 delete mode 100644 Brouillon_AJ.ipynb

diff --git a/0_Cleaning_and_merge.py b/0_Cleaning_and_merge.py
index 9c9aac0..860cec1 100644
--- a/0_Cleaning_and_merge.py
+++ b/0_Cleaning_and_merge.py
@@ -8,78 +8,7 @@ import re
 import warnings
 
 # Import cleaning and merge functions
-exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
 exec(open('BDC-team-1/0_KPI_functions.py').read())
-
-# Create filesystem object
-S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
-fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
-
-# Ignore warnings
-warnings.filterwarnings('ignore')
-
-# Data loading
-BUCKET = "bdc2324-data/1"
-liste_database = fs.ls(BUCKET)
-
-# Loop creating one dataframe per file in the list
-client_number = liste_database[0].split("/")[1]
-df_prefix = "df" + str(client_number) + "_"
-
-for i in range(len(liste_database)):
-    current_path = liste_database[i]
-    with fs.open(current_path, mode="rb") as file_in:
-        df = pd.read_csv(file_in)
-    # the pattern of the name is df1xxx
-    nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
-    globals()[nom_dataframe] = df
-
-## 1 - Cleaning of the datasets
-
-# Cleaning customerplus
-df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
-
-# Cleaning target area
-df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)
-
-# Cleaning campaign area
-df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)
-
-# Export
-BUCKET_OUT = "projet-bdc2324-team1"
-FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Campaigns dataset clean.csv"
-FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
-
-with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
-    df1_campaigns_information.to_csv(file_out, index = False)
-
-## Cleaning product area
-
-# Cleaning ticket area
-df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)
-
-
-BUCKET = "bdc2324-data"
-directory_path = '1'
-
-products_theme = create_products_table()
-events_theme = create_events_table()
-representation_theme = create_representations_table()
-products_global = uniform_product_df()
-
-# Product-related merge
-df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
-
-# Select the variables of interest
-df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
-
-# Export
-BUCKET_OUT = "projet-bdc2324-team1"
-FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchases.csv"
-FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
-
-with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
-    df1_products_purchased_reduced.to_csv(file_out, index = False)
-
 ## 2 - Construction of KPIs on a given period
 
 def explanatory_variables(min_date, max_date, df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):
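Note on the deletions above: the old script materialised one module-level df1_* dataframe per CSV by regex-matching each S3 path and writing into globals(). A minimal sketch of the same loading pattern with a plain dict instead of globals() (the function name and dict layout are illustrative, not part of the repository; fs is the s3fs filesystem the script creates earlier):

    import re
    import pandas as pd

    def load_company_tables(fs, bucket = "bdc2324-data", client = "1"):
        """Read every CSV under bucket/client into a dict keyed by table name."""
        tables = {}
        for path in fs.ls(bucket + "/" + client):
            # files follow the "<client>/<client><table>.csv" layout, e.g. 1/1tickets.csv
            match = re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', path)
            if match is None:
                continue  # skip anything that does not match the naming convention
            with fs.open(path, mode = "rb") as file_in:
                tables[match.group(3)] = pd.read_csv(file_in)
        return tables

    # tables = load_company_tables(fs)
    # tables["customersplus"], tables["tickets"], ... play the roles of
    # df1_customersplus, df1_tickets, ... from the deleted code above.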
diff --git a/0_Cleaning_and_merge_functions.py b/0_Cleaning_and_merge_functions.py
index 042c60e..cab8115 100644
--- a/0_Cleaning_and_merge_functions.py
+++ b/0_Cleaning_and_merge_functions.py
@@ -1,38 +1,92 @@
-# Cleaning and merge functions
+#### Cleaning and merge functions ####
 
-# Cleaning function
+BUCKET = "bdc2324-data"
+
+# 1. Basic cleaning functions
 def cleaning_date(df, column_name):
     """
-    Cleans the given DataFrame column by converting its values to datetime in ISO 8601 format.
-
-    Parameters:
-    - df: DataFrame - the DataFrame containing the column to clean.
-    - column_name: str - the name of the column to clean.
-
-    Returns:
-    - DataFrame - the DataFrame with the cleaned column.
+    Cleans a datetime column using the ISO 8601 format
    """
     df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
     return df
 
-def preprocessing_customerplus(customerplus = None):
+def display_databases(directory_path, file_name):
+    """
+    Returns the requested file from S3 storage
+    """
+    file_path = BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",")
+
+    print("Shape : ", df.shape)
+    return df
 
-    customerplus_copy = customerplus.copy()
+def remove_horodates(df):
+    """
+    Removes timestamp columns such as created_at and updated_at
+    """
+    df = df.drop(columns = ["created_at", "updated_at"])
+    return df
+
+def order_columns_id(df):
+    """
+    Puts all id columns first so the dataset is easier to read
+    """
+    substring = 'id'
+    id_columns = [col for col in df.columns if substring in col]
+    remaining_col = [col for col in df.columns if substring not in col]
+    new_order = id_columns + remaining_col
+    return df[new_order]
+
+def process_df_2(df):
+    """
+    Organizes a dataframe: drops timestamp columns and reorders the rest
+    """
+    df = remove_horodates(df)
+    print("Number of columns : ", len(df.columns))
+    df = order_columns_id(df)
+    print("Columns : ", df.columns)
+    return df
+
+def load_dataset(directory_path, name):
+    """
+    Loads and organizes a csv file
+    """
+    df = display_databases(directory_path, file_name = name)
+    df = process_df_2(df)
+    # drop na :
+    #df = df.dropna(axis=1, thresh=len(df))
+    # if an identifier column is in the table, delete it
+    if 'identifier' in df.columns:
+        df = df.drop(columns = 'identifier')
+    return df
+
+
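Taken together, the helpers above form the loading pipeline used by every function in section 2: load_dataset fetches a CSV through display_databases, then process_df_2 drops the created_at/updated_at timestamps and fronts the id columns. display_databases concatenates directory_path twice because files follow the "<n>/<n><table>.csv" layout, e.g. "1/1customersplus.csv". A minimal usage sketch (the company folder "1" is illustrative, and fs is assumed to be the s3fs filesystem set up by the caller):

    # reads bdc2324-data/1/1customersplus.csv and organizes it
    customersplus = load_dataset("1", name = "customersplus")

    # which expands to roughly:
    df = display_databases("1", file_name = "customersplus")  # read from S3, print path and shape
    df = remove_horodates(df)                                 # drop created_at / updated_at
    df = order_columns_id(df)                                 # id columns first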
+# 2. Creation of cleaned and merged datasets
+
+def preprocessing_customerplus(directory_path):
+
+    customerplus_copy = load_dataset(directory_path, name = "customersplus")
 
     # Convert to date format
     cleaning_date(customerplus_copy, 'first_buying_date')
     cleaning_date(customerplus_copy, 'last_visiting_date')
 
     # Select the variables
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
 
     customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
 
     return customerplus_copy
 
-def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    type_ofs = load_dataset(directory_path, name = "type_ofs")
+
     # Tickets table
     tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
     tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
@@ -48,7 +102,7 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
 
     # Purchases table
     # Clean the purchase date
-    cleaning_date(purchases, 'purchase_date')
+    # cleaning_date(purchases, 'purchase_date')
 
     # Select the variables
     purchases = purchases[['id', 'purchase_date', 'customer_id']]
@@ -67,8 +121,13 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
 
     return ticket_information
 
-def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
-    # Target.csv cleaning
+def preprocessing_target_area(directory_path):
+
+    # Datasets loading
+    targets = load_dataset(directory_path, name = "targets")
+    target_types = load_dataset(directory_path, name = "target_types")
+    customer_target_mappings = load_dataset(directory_path, name = "customer_target_mappings")
+    # target cleaning
     targets = targets[["id", "target_type_id", "name"]]
     targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)
@@ -88,16 +147,21 @@ def preprocessing_target_area(targets = None, target_types = None, customer_targ
 
     return targets_full
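The hunks above and below all make the same interface change: the preprocessing_* functions stop taking pre-loaded dataframes and instead receive only directory_path, loading their own inputs through load_dataset. A sketch of the call-site difference, reusing the variable names from the old 0_Cleaning_and_merge.py (company "1" is illustrative):

    # before this patch: the caller loaded the raw tables, then passed them in
    # df1_target_information = preprocessing_target_area(
    #     targets = df1_targets,
    #     target_types = df1_target_types,
    #     customer_target_mappings = df1_customer_target_mappings)

    # after this patch: the function pulls what it needs from S3 itself
    df1_target_information = preprocessing_target_area(directory_path = "1")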
-def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
+def preprocessing_campaigns_area(directory_path):
+
+    # Datasets loading
+    campaign_stats = load_dataset(directory_path, name = "campaign_stats")
+    campaigns = load_dataset(directory_path, name = "campaigns")
+
     # campaign_stats cleaning
     campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
-    cleaning_date(campaign_stats, 'opened_at')
-    cleaning_date(campaign_stats, 'sent_at')
-    cleaning_date(campaign_stats, 'delivered_at')
+    # cleaning_date(campaign_stats, 'opened_at')
+    # cleaning_date(campaign_stats, 'sent_at')
+    # cleaning_date(campaign_stats, 'delivered_at')
 
     # campaigns cleaning
     campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
-    cleaning_date(campaigns, 'campaign_sent_at')
+    # cleaning_date(campaigns, 'campaign_sent_at')
 
     # Merge
     campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")
@@ -105,66 +169,11 @@
 
     return campaigns_full
 
-def display_databases(file_name):
-    """
-    This function returns the file from s3 storage
-    """
-    file_path = BUCKET + "/" + directory_path + "/" + file_name
-    print("File path : ", file_path)
-    with fs.open(file_path, mode="rb") as file_in:
-        df = pd.read_csv(file_in, sep=",")
-
-    print("Shape : ", df.shape)
-    return df
-
-
-def remove_horodates(df):
-    """
-    this function remove horodate columns like created_at and updated_at
-    """
-    df = df.drop(columns = ["created_at", "updated_at"])
-    return df
-
-
-def order_columns_id(df):
-    """
-    this function puts all id columns at the beginning in order to read the dataset easier
-    """
-    substring = 'id'
-    id_columns = [col for col in df.columns if substring in col]
-    remaining_col = [col for col in df.columns if substring not in col]
-    new_order = id_columns + remaining_col
-    return df[new_order]
-
-
-def process_df_2(df):
-    """
-    This function organizes dataframe
-    """
-    df = remove_horodates(df)
-    print("Number of columns : ", len(df.columns))
-    df = order_columns_id(df)
-    print("Columns : ", df.columns)
-    return df
-
-def load_dataset(name):
-    """
-    This function loads csv file
-    """
-    df = display_databases(name)
-    df = process_df_2(df)
-    # drop na :
-    #df = df.dropna(axis=1, thresh=len(df))
-    # if identifier in table : delete it
-    if 'identifier' in df.columns:
-        df = df.drop(columns = 'identifier')
-    return df
-
-def create_products_table():
+def create_products_table(directory_path):
     # first merge products and categories
     print("first merge products and categories")
-    products = load_dataset("1products.csv")
-    categories = load_dataset("1categories.csv")
+    products = load_dataset(directory_path, name = "products")
+    categories = load_dataset(directory_path, name = "categories")
     # Drop useless columns
     products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
     categories = categories.drop(columns = ['extra_field', 'quota'])
@@ -176,7 +185,7 @@
 
     # Second merge products_theme and type of categories
     print("Second merge products_theme and type of categories")
-    type_of_categories = load_dataset("1type_of_categories.csv")
+    type_of_categories = load_dataset(directory_path, name = "type_of_categories")
     type_of_categories = type_of_categories.drop(columns = 'id')
     products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id', right_on = 'category_id' )
@@ -187,11 +196,11 @@
 
     return products_theme
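create_products_table and the two builders that follow it repeat one join recipe: left-merge a lookup table on its id, let suffixes disambiguate the clashing column names, then drop the now-redundant join key. A self-contained sketch of that recipe on toy data (all values invented for illustration):

    import pandas as pd

    products = pd.DataFrame({"id": [10, 11], "category_id": [1, 2], "amount": [25.0, 40.0]})
    categories = pd.DataFrame({"id": [1, 2], "name": ["opera", "concert"]})

    # "id" exists in both frames, so suffixes renames the two copies
    # to id_products / id_categories after the merge
    products_theme = products.merge(categories, how = 'left',
                                    left_on = 'category_id', right_on = 'id',
                                    suffixes = ('_products', '_categories'))
    products_theme = products_theme.drop(columns = ['id_categories'])  # redundant join key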
-def create_events_table():
+def create_events_table(directory_path):
     # first merge events and seasons :
     print("first merge events and seasons : ")
-    events = load_dataset("1events.csv")
-    seasons = load_dataset("1seasons.csv")
+    events = load_dataset(directory_path, name = "events")
+    seasons = load_dataset(directory_path, name = "seasons")
 
     # Drop useless columns
     events = events.drop(columns = ['manual_added', 'is_display'])
@@ -201,7 +210,7 @@
 
     # Secondly merge events_theme and event_types
     print("Secondly merge events_theme and event_types : ")
-    event_types = load_dataset("1event_types.csv")
+    event_types = load_dataset(directory_path, name = "event_types")
     event_types = event_types.drop(columns = ['fidelity_delay'])
     events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))
@@ -210,7 +219,7 @@
 
     # thirdly merge events_theme and facilities
     print("thirdly merge events_theme and facilities : ")
-    facilities = load_dataset("1facilities.csv")
+    facilities = load_dataset(directory_path, name = "facilities")
     facilities = facilities.drop(columns = ['fixed_capacity'])
     events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))
@@ -222,14 +231,13 @@
 
     events_theme = order_columns_id(events_theme)
     return events_theme
 
-
-def create_representations_table():
-    representations = load_dataset("1representations.csv")
+def create_representations_table(directory_path):
+    representations = load_dataset(directory_path, name = "representations")
     representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name', 'representation_type_id'])
 
-    representations_capacity = load_dataset("1representation_category_capacities.csv")
+    representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
     representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
 
     representations_theme = representations.merge(representations_capacity, how='left',
@@ -240,22 +248,27 @@
 
     representations_theme = order_columns_id(representations_theme)
     return representations_theme
 
-def uniform_product_df():
+def uniform_product_df(directory_path):
     """
     This function returns the uniform product dataset
     """
+    products_theme = create_products_table(directory_path)
+    representation_theme = create_representations_table(directory_path)
+    events_theme = create_events_table(directory_path)
+    ticket_information = preprocessing_tickets_area(directory_path)
+
     print("Products theme columns : ", products_theme.columns)
     print("\n Representation theme columns : ", representation_theme.columns)
     print("\n Events theme columns : ", events_theme.columns)
 
-    products_global = products_theme.merge(representation_theme, how='left',
+    products_global = pd.merge(products_theme, representation_theme, how='left',
                                            on= ["representation_id", "category_id"])
 
-    products_global = products_global.merge(events_theme, how='left', on='event_id',
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
                                             suffixes = ("_representation", "_event"))
 
-    products_global = order_columns_id(products_global)
+    products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
 
-    # remove useless columns
-    products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'
-    return products_global
\ No newline at end of file
+    return products_purchased_reduced
\ No newline at end of file
diff --git 
a/1_Descriptive_Statistics.ipynb b/1_Descriptive_Statistics.ipynb index e223399..920cdc2 100644 --- a/1_Descriptive_Statistics.ipynb +++ b/1_Descriptive_Statistics.ipynb @@ -615,19 +615,15 @@ "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " purchases = pd.read_csv(file_in, sep=\",\")\n", + " purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'])\n", " \n", - "purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True, format = 'ISO8601')\n", - "\n", "# Emails\n", "BUCKET = \"projet-bdc2324-team1\"\n", "FILE_KEY_S3 = \"0_Temp/Company 1 - Campaigns dataset clean.csv\"\n", "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaigns = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "campaigns['sent_at'] = pd.to_datetime(campaigns['sent_at'], utc = True, format = 'ISO8601')\n" + " campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'])\n" ] }, { @@ -818,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "id": "f663d68b-8a5c-4804-b31a-4477a03ca1e4", "metadata": { "scrolled": true @@ -906,7 +902,7 @@ "max 641981.000000 1.256574e+06" ] }, - "execution_count": 33, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -917,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "d1212b10-3933-450a-b001-9e2cbf308f79", "metadata": {}, "outputs": [ @@ -1219,7 +1215,7 @@ "[1826672 rows x 15 columns]" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1238,7 +1234,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "dc45c1cd-2a78-48a6-aa2b-6a501254b6f2", "metadata": {}, "outputs": [ @@ -1458,7 +1454,7 @@ "[5 rows x 40 columns]" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1478,7 +1474,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "89fcb455-efb4-4ad4-ab88-efd6c8a76287", "metadata": {}, "outputs": [ @@ -1499,7 +1495,7 @@ " dtype='object')" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1510,7 +1506,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "d7b2356a-d5fc-4547-b3ff-fded0e304fb6", "metadata": {}, "outputs": [ @@ -1634,7 +1630,7 @@ "9 0.0 " ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1653,7 +1649,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "5559748f-1745-4651-a9f6-94702c7ee66f", "metadata": {}, "outputs": [ @@ -1813,7 +1809,7 @@ "max 434.000000 " ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1835,7 +1831,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "4971e35d-a762-4e18-9443-fd9571bd3f1e", "metadata": {}, "outputs": [ @@ -1864,7 +1860,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "bc65a711-d172-4839-b487-3047280fc3a6", "metadata": {}, "outputs": [ @@ -1894,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "c95cc35c-abfc-47c7-9b8a-ac69bfd60dd8", "metadata": {}, "outputs": [ @@ -1922,7 +1918,7 @@ }, { "cell_type": "code", - 
"execution_count": 24, + "execution_count": 25, "id": "49d5fd2d-9bc1-43ac-9270-1efd73759854", "metadata": {}, "outputs": [ @@ -1967,7 +1963,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "e50e2583-4b8f-478e-87ac-591dde200af8", "metadata": {}, "outputs": [ @@ -1988,7 +1984,7 @@ " dtype='object')" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1999,7 +1995,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "c724a315-9fe8-4874-be8f-a8115b17b5e2", "metadata": {}, "outputs": [], @@ -2021,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "58af5dcb-673e-4f4d-ad5c-f66ce1e8a22c", "metadata": {}, "outputs": [ @@ -2042,7 +2038,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "cc3437f7-8b36-4398-9da6-ff15e8e4c8d7", "metadata": {}, "outputs": [ diff --git a/Brouillon_AJ.ipynb b/Brouillon_AJ.ipynb deleted file mode 100644 index 8f5529a..0000000 --- a/Brouillon_AJ.ipynb +++ /dev/null @@ -1,695 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", - "metadata": {}, - "source": [ - "# Business Data Challenge - Team 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", - "metadata": {}, - "source": [ - "Configuration de l'accès aux données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import s3fs\n", - "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", - "\n", - "BUCKET = \"bdc2324-data\"\n", - "fs.ls(BUCKET)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", - "metadata": {}, - "outputs": [], - "source": [ - "# Chargement des fichiers campaign_stats.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", - "metadata": {}, - "outputs": [], - "source": [ - "# Conversion des dates 'sent_at'\n", - "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n", - "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n", - "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", - "metadata": {}, - 
"outputs": [], - "source": [ - "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n", - "print(campaign_stats_1['sent_at'].max())\n", - "print(campaign_stats_1['sent_at'].min())\n", - "\n", - "print(campaign_stats_2['sent_at'].max())\n", - "print(campaign_stats_2['sent_at'].min())\n", - "\n", - "print(campaign_stats_3['sent_at'].max())\n", - "print(campaign_stats_3['sent_at'].min())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", - "metadata": {}, - "outputs": [], - "source": [ - "campaign_stats_1['sent_at']" - ] - }, - { - "cell_type": "markdown", - "id": "31f2edbf-5661-4516-9835-06d4da615c13", - "metadata": {}, - "source": [ - "### Customersplus.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", - "metadata": {}, - "outputs": [], - "source": [ - "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customers_plus_2 = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "460f853a-68c0-42a7-9877-b83d3aaec813", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1['id'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_2['id'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b40a653e-013f-48d0-8b57-0284587b36c5", - "metadata": {}, - "outputs": [], - "source": [ - "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32fa2215-3c79-40b5-8643-755865959fc7", - "metadata": {}, - "outputs": [], - "source": [ - "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n", - "# Exemple id commun = caractéristiques communes\n", - "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n", - "\n", - "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "customers_plus_1.isna().mean()*100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f6ce60d-0912-497d-9108-330acccef394", - "metadata": {}, - "outputs": [], - "source": [ - "# Chargement de toutes les données\n", - "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", - "\n", - "for nom_base in liste_base:\n", - " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", - " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " 
globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Jointure\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n", - "df_customer_event" - ] - }, - { - "cell_type": "markdown", - "id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f", - "metadata": {}, - "source": [ - "# Fusion et exploration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22bfad2b-d52a-4077-9b39-bee35004e01c", - "metadata": {}, - "outputs": [], - "source": [ - "# Jointure\n", - "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('representation_id')\n", - "var_choosed.extend(['start_date_time', 'event_id'])\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('event_id')\n", - "var_choosed.extend(['name', 'customer_id'])\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n", - "\n", - "# Changement de nom\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "var_choosed[var_choosed.index('name')] = \"event_name\"\n", - "\n", - "# Base finale\n", - "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n", - "df_customer_event" - ] - }, - { - "cell_type": "markdown", - "id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31", - "metadata": {}, - "source": [ - "## Type de client au globale" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1", - "metadata": {}, - "outputs": [], - "source": [ - "# Client\n", - "print(customer_target_mappings.columns)\n", - "print(customer_target_mappings.shape)\n", - "customer_target_mappings.info()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f", - "metadata": {}, - "outputs": [], - "source": [ - "customer_target_mappings['extra_field'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec", - "metadata": {}, - "outputs": [], - "source": [ - "customer_target_mappings['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4", - "metadata": {}, - "outputs": [], - "source": [ - "# Segmentation existante\n", - "print(target_types.columns)\n", - "print(target_types.shape)\n", - "target_types.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5adb1773-648d-4683-bc08-d1f2298c1283", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "target_types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d65f74e-47fc-4296-b493-a1ebefb91cde", - "metadata": {}, - "outputs": [], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tags = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tags.columns)\n", - "print(tags.shape)\n", - "tags.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a689a63-165b-4c4e-bbb0-695b661048d9", - "metadata": {}, - "outputs": [], - "source": [ - "tags" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69e38c52-0570-4531-aebb-9deb6db8c40b", - "metadata": {}, - "outputs": [], - "source": [ - "# Structure = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(structure_tag_mappings.columns)\n", - "print(structure_tag_mappings.shape)\n", - "structure_tag_mappings.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74dc34ad-375b-48df-a900-40d92c5fff13", - "metadata": {}, - "outputs": [], - "source": [ - "structure_tag_mappings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe", - "metadata": {}, - "outputs": [], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customersplus = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(customersplus.columns)\n", - "print(customersplus.shape)\n", - "customersplus.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "383e892c-606a-45ce-bdd6-b503b3e0be33", - "metadata": {}, - "outputs": [], - "source": [ - "customersplus" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70324d06-b855-4386-a7de-eef1eb13dfdf", - "metadata": {}, - "outputs": [], - "source": [ - "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c", - "metadata": {}, - "outputs": [], - "source": [ - "# tickets\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tickets = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tickets.columns)\n", - "print(tickets.shape)\n", - "tickets.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20", - "metadata": {}, - "outputs": [], - "source": [ - "tickets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6", - "metadata": {}, - "outputs": [], - "source": [ - "tickets['type_of'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "bc192b08-30a5-486a-8bea-93e765dbfce6", - "metadata": {}, - "source": [ - "## Types d'évenement et client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e14dcf62-2def-4ed5-834b-cf21abbc2894", - "metadata": {}, - "outputs": [], - "source": [ - "# Evenement = events.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " events = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(events.columns)\n", - "print(events.shape)\n", - "events.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316", - "metadata": {}, - "outputs": [], - "source": [ - "events" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af80eee8-f717-4159-a0fd-09d47ec96621", - "metadata": {}, - "outputs": [], - "source": [ - "events['name'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2", - "metadata": {}, - "outputs": [], - "source": [ - "# Représentation des évenements = representations.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " representations = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(representations.columns)\n", - "print(representations.shape)\n", - "representations.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1487402a-a49b-4737-b7d7-40c764d2f0b4", - "metadata": {}, - "outputs": [], - "source": [ - "representations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99b27418-2c15-4a6e-bcf5-d329ca492085", - "metadata": {}, - "outputs": [], - "source": [ - "# Produits vendues = products.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " products = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(products.columns)\n", - "print(products.shape)\n", - "products.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c49bcd47-672f-4e0f-aee9-a7475151b97f", - "metadata": {}, - "outputs": [], - "source": [ - "products" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4aec5ce-d0c9-4625-bb29-9ac154818621", - "metadata": {}, - "outputs": [], - "source": [ - "# Lieu = facilities.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " facilities = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(facilities.columns)\n", - "print(facilities.shape)\n", - "facilities.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3642483-2879-442a-ad69-efcd2331a200", - "metadata": {}, - "outputs": [], - "source": [ - "facilities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da1e9807-2a8d-4be7-a785-55cffd734f36", - "metadata": {}, - "outputs": [], - "source": [ - "# Saisons = seasons.csv période sur deux années consécutives\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, 
mode=\"rb\") as file_in:\n", - " seasons = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(seasons.columns)\n", - "print(seasons.shape)\n", - "seasons.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9", - "metadata": {}, - "outputs": [], - "source": [ - "seasons['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abb3aa20-774b-4761-983a-df5eb2bc51c6", - "metadata": {}, - "outputs": [], - "source": [ - "# Achats = purchases.csv \n", - "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " purchases = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(purchases.columns)\n", - "print(purchases.shape)\n", - "purchases.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30e204ab-4f63-430c-a818-5c8035b6e17b", - "metadata": {}, - "outputs": [], - "source": [ - "purchases" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index bec456e..d697ff5 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -124,9 +124,7 @@ { "cell_type": "markdown", "id": "e855f403", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## customersplus.csv" ] @@ -1289,7 +1287,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.6" } }, "nbformat": 4,