From 979c0fe6a8873d47af2d5421dd7e210aa00950d2 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 29 Feb 2024 08:33:05 +0000 Subject: [PATCH 1/5] fix path --- 0_2_Dataset_construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py index 2e9b9e0..26e3ad3 100644 --- a/0_2_Dataset_construction.py +++ b/0_2_Dataset_construction.py @@ -147,7 +147,7 @@ for company in list_of_comp: # Exportation FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv" - FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 + FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3 with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: dataset_test.to_csv(file_out, index = False) @@ -159,7 +159,7 @@ for company in list_of_comp: max_date = final_date, directory_path = company) # Export FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv" - FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 + FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3 with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: dataset_train.to_csv(file_out, index = False) From b71f842fe419a07c5b0a1482df51f07284e64410 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 29 Feb 2024 09:09:04 +0000 Subject: [PATCH 2/5] add prefix to customer_id --- 0_2_Dataset_construction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py index 26e3ad3..917dee9 100644 --- a/0_2_Dataset_construction.py +++ b/0_2_Dataset_construction.py @@ -122,7 +122,10 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path): dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left') # 0 if there is no purchase - dataset[['y_has_purchased']].fillna(0) + dataset[['y_has_purchased']].fillna(0) + + # add id_company prefix to customer_id + dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str') return dataset 
From b840b2403c6552efb2162ddeca4df43e01ec3c4f Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 29 Feb 2024 09:25:40 +0000 Subject: [PATCH 3/5] Generate global modelization datasets --- 0_3_General_modelization_dataset.py | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 0_3_General_modelization_dataset.py diff --git a/0_3_General_modelization_dataset.py b/0_3_General_modelization_dataset.py new file mode 100644 index 0000000..0161ea7 --- /dev/null +++ b/0_3_General_modelization_dataset.py @@ -0,0 +1,68 @@ +# Business Data Challenge - Team 1 + +import pandas as pd +import numpy as np +import os +import s3fs +import re +import warnings +from datetime import date, timedelta, datetime + +# Create filesystem object +S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] +fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) + + +# Import KPI construction functions +exec(open('0_KPI_functions.py').read()) + +# Ignore warning +warnings.filterwarnings('ignore') + +# functions +def generate_test_set(): + file_path_list = fs.ls("projet-bdc2324-team1/Generalization/sport/Test_set") + test_set = pd.DataFrame() + for file in file_path_list: + print(file) + with fs.open(file, mode="rb") as file_in: + df = pd.read_csv(file_in, sep=",") + test_set = pd.concat([test_set, df], ignore_index = True) + return test_set + + +def generate_train_set(): + file_path_list = fs.ls("projet-bdc2324-team1/Generalization/sport/Train_set") + train_set = pd.DataFrame() + for file in file_path_list: + print(file) + with fs.open(file, mode="rb") as file_in: + df = pd.read_csv(file_in, sep=",") + train_set = pd.concat([train_set, df], ignore_index = True) + return train_set + + +type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? 
musee ?') +BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/' + +# create test and train datasets +test_set = generate_test_set() +train_set = generate_train_set() + +# Exportation test set +FILE_KEY_OUT_S3 = "Test_set.csv" +FILE_PATH_OUT_S3 = BUCKET_OUT + FILE_KEY_OUT_S3 + +with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: + test_set.to_csv(file_out, index = False) + +print("Exportation dataset test : SUCCESS") + +# Exportation train set +FILE_KEY_OUT_S3 = "Train_set.csv" +FILE_PATH_OUT_S3 = BUCKET_OUT + FILE_KEY_OUT_S3 + +with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: + train_set.to_csv(file_out, index = False) + +print("Exportation dataset train : SUCCESS") From 766463acaaf945f4bde839d448b1c2fc684b756e Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 29 Feb 2024 10:14:16 +0000 Subject: [PATCH 4/5] work on DS dataset --- .../Descriptive_statistics/generate_dataset_DS.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 Sport/Descriptive_statistics/generate_dataset_DS.py diff --git a/Sport/Descriptive_statistics/generate_dataset_DS.py b/Sport/Descriptive_statistics/generate_dataset_DS.py new file mode 100644 index 0000000..889db77 --- /dev/null +++ b/Sport/Descriptive_statistics/generate_dataset_DS.py @@ -0,0 +1,14 @@ +import pandas as pd +import numpy as np +import os +import s3fs +import re +import warnings + +# Create filesystem object +S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] +fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) + +# Ignore warning +warnings.filterwarnings('ignore') + From 9c0aff85e574a9294bf9a31db254961640a515d9 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 29 Feb 2024 11:12:46 +0000 Subject: [PATCH 5/5] fix path --- 0_3_General_modelization_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/0_3_General_modelization_dataset.py b/0_3_General_modelization_dataset.py index 0161ea7..2ba6a75 100644 --- 
a/0_3_General_modelization_dataset.py +++ b/0_3_General_modelization_dataset.py @@ -20,8 +20,8 @@ exec(open('0_KPI_functions.py').read()) warnings.filterwarnings('ignore') # functions -def generate_test_set(): - file_path_list = fs.ls("projet-bdc2324-team1/Generalization/sport/Test_set") +def generate_test_set(type_of_comp): + file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Test_set") test_set = pd.DataFrame() for file in file_path_list: print(file) @@ -31,8 +31,8 @@ def generate_test_set(): return test_set -def generate_train_set(): - file_path_list = fs.ls("projet-bdc2324-team1/Generalization/sport/Train_set") +def generate_train_set(type_of_comp): + file_path_list = fs.ls(f"projet-bdc2324-team1/Generalization/{type_of_comp}/Train_set") train_set = pd.DataFrame() for file in file_path_list: print(file) @@ -46,8 +46,8 @@ type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}/' # create test and train datasets -test_set = generate_test_set() -train_set = generate_train_set() +test_set = generate_test_set(type_of_comp) +train_set = generate_train_set(type_of_comp) # Exportation test set FILE_KEY_OUT_S3 = "Test_set.csv"