Ajouter google_scraping.py

2024-06-27 11:06:25 +02:00 · 2024-06-27 11:06:25 +02:00 · 32ef2b7fb9
commit 32ef2b7fb9
parent 244abd9c6a
1 changed files with 102 additions and 0 deletions
--- a/google_scraping.py
+++ b/google_scraping.py
@ -0,0 +1,102 @@
+import time
+from urllib.parse import quote_plus
+
+from bs4 import BeautifulSoup
+from icecream import ic
+from selenium import webdriver
+
+
+# Google scraping for restaurants in Le Fooding
+def google_scrap_fooding(df):
+    price_ranges = []
+
+    for i in range(len(df['Name'])):
+        name = df.iloc[i]['Name']
+        address = df.iloc[i]['Address']
+
+        options = webdriver.ChromeOptions()
+        options.add_argument("---headless=True")
+        driver = webdriver.Chrome(options=options)
+        base_url = 'https://www.google.com/search?hl=en&q='
+
+        url = base_url + str(name).replace(' ', '-') + str(address).replace(' ', '-')
+        ic()
+        ic(url)
+
+        driver.get(url)
+        time.sleep(0.1)
+
+        page_source = driver.page_source
+        driver.quit()
+        soup = BeautifulSoup(page_source, features='html.parser')
+
+        all_span = [_.text for _ in soup.find_all('span')]
+
+        price_range = ''.join([_ for _ in all_span if '€' in _ and '–' in _]).split('...')[-1][1:].split('€')[0]
+        ic(price_range)
+
+        if len(price_range) < 8:
+            price_ranges.append(price_range)
+        else:
+            price_ranges.append('')
+
+    df['Price Range'] = price_ranges
+
+    return df
+
+
+def get_price_range_fute(soup):
+    price_range = (''.join([_.text for _ in soup.find_all('span') if 'Reported by' in _.text]).
+                   strip().split('R')[0].split('€')[-1])
+
+    if price_range == '':
+        all_span = [_.text for _ in soup.find_all('span')]
+        price_range = [_ for _ in all_span if '· Prix. de' in _]
+        price_range = ''.join(price_range).split('€')[0].split('· Prix. de')[-1].strip()
+
+    return price_range
+
+
+# Google scraping for restaurants in Petit Fute
+def get_mean_price_fute(soup):
+    mean_price = [_.text for _ in soup.find_all('span') if '(Les prix ont été fournis par le restaurant)' in _.text]
+    mean_price = ''.join(mean_price).split('€')[0].split('Price range: ')[-1].strip()
+    return mean_price
+
+
+def google_scrap_fute(df):
+    price_ranges = []
+    mean_prices = []
+
+    ic()
+    for name, address in zip(df['Name'], df['Address']):
+        ic()
+        options = webdriver.ChromeOptions()
+        options.add_argument("---headless=True")
+        driver = webdriver.Chrome(options=options)
+        base_url = 'https://www.google.com/search?hl=en&q='
+        try:
+            url = base_url + name.replace(' ', '-') + address.replace(' ', '-')
+            ic(url)
+
+            driver.get(url)
+            time.sleep(0.1)
+
+            page_source = driver.page_source
+            driver.quit()
+            soup = BeautifulSoup(page_source, features='html.parser')
+
+            price_range = get_price_range_fute(soup)
+            ic(price_range)
+            mean_price = get_mean_price_fute(soup)
+            ic(mean_price)
+            price_ranges.append(price_range)
+            mean_prices.append(mean_price)
+
+        except AttributeError or ConnectionError:
+            ic()
+            price_ranges.append('NA')
+            mean_prices.append('NA')
+
+    df['Price Range'] = price_ranges
+    df['Mean Price'] = mean_prices
+
+    return df