Actualiser best_restaurants_scraping.py

2024-07-15 14:42:28 +02:00 · 2024-07-15 14:42:28 +02:00 · b0a9202d8c
commit b0a9202d8c
parent 2b01ab1af8
1 changed file with 16 additions and 2 deletions
--- a/best_restaurants_scraping.py
+++ b/best_restaurants_scraping.py
@@ -85,7 +85,7 @@ def scrap(soup):
            os.makedirs(newpath)

        html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' +
-                     str(datetime.date.today()) + '/' + name + '.html')
+                     str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')

        with open(html_path, 'wt', encoding='utf-8') as html_file:
            for line in soup.prettify():
@@ -115,6 +115,7 @@ def get_soup(url):


 def complete_scraping():
+    ic()
    url = 'https://bestrestaurantsparis.com/fr/explore/'
    soup = get_soup(url)

@@ -127,6 +128,19 @@ def complete_scraping():
        new_soup = get_soup(new_url)
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
-        
+
    df = df.dropna(subset='Address')
+
+    prices = df['Price']
+    for i in range(len(prices)):
+        if 'href' in prices[i]:
+            ic()
+            df = df.drop(index=[i, i + 1])
+
+    items = df['Price']
+    for i in range(len(items)):
+        if not pd.isna(items[i]):
+            if items[i].replace('€', '') != '':
+                items[i] = items[i].split(' ')[0]
+
    return df