# Recovered from the patch "Ajouter google_scraping.py" (commit 32ef2b7f,
# Olivier MEYER, 2024-06-27), which created google_scraping.py.
"""Scrape Google search result pages for restaurant price information.

Each public ``google_scrap_*`` function takes a DataFrame with ``Name`` and
``Address`` columns, runs one Google search per row through a Selenium-driven
Chrome instance, and adds the extracted price text as new column(s).
"""
import time
from urllib.parse import quote

from bs4 import BeautifulSoup
from icecream import ic
from selenium import webdriver

_BASE_URL = 'https://www.google.com/search?hl=en&q='

# Pause (seconds) between loading the page and reading its source.
_PAGE_LOAD_PAUSE = 0.1


def _build_search_url(name, address):
    """Return the Google search URL for a restaurant name and address.

    Spaces are replaced by hyphens, the two parts are joined with a hyphen so
    the last word of the name does not fuse with the first word of the
    address, and the result is percent-encoded so that characters such as
    ``&`` or ``#`` cannot truncate or alter the query string.

    Raises:
        AttributeError: if ``name`` or ``address`` has no ``replace`` method
            (for example a float NaN coming from an empty DataFrame cell).
    """
    query = '-'.join(part.replace(' ', '-') for part in (name, address))
    return _BASE_URL + quote(query, safe='')


def _fetch_soup(url):
    """Load ``url`` in a fresh headless Chrome and return the parsed page.

    The driver is always shut down, even when loading the page raises, so a
    failing row does not leave a Chrome process behind.
    """
    options = webdriver.ChromeOptions()
    # The original flag was "---headless=True" (three dashes), which Chrome
    # does not recognise as the headless switch.
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(_PAGE_LOAD_PAUSE)
        page_source = driver.page_source
    finally:
        driver.quit()
    return BeautifulSoup(page_source, features='html.parser')


# Google scraping for restaurants in Le Fooding
def google_scrap_fooding(df):
    """Add a ``Price Range`` column scraped from Google to ``df``.

    Args:
        df: DataFrame with ``Name`` and ``Address`` columns; values are
            converted with ``str`` before being searched.

    Returns:
        The same DataFrame, modified in place, with a ``Price Range`` column.
        A row gets ``''`` when no short (< 8 characters) price text is found.
    """
    price_ranges = []

    for name, address in zip(df['Name'], df['Address']):
        url = _build_search_url(str(name), str(address))
        ic()
        ic(url)

        soup = _fetch_soup(url)
        span_texts = [span.text for span in soup.find_all('span')]

        # Keep only spans that contain both a euro sign and an en dash, then
        # cut the text after the last '...' and before the first '€'.
        candidates = ''.join(
            text for text in span_texts if '€' in text and '–' in text
        )
        price_range = candidates.split('...')[-1][1:].split('€')[0]
        ic(price_range)

        # Anything of 8 characters or more is treated as a parsing miss.
        price_ranges.append(price_range if len(price_range) < 8 else '')

    df['Price Range'] = price_ranges

    return df


# Google scraping for restaurants in Petit Fute
def get_price_range_fute(soup):
    """Extract the price-range text from a parsed Google result page.

    First looks at spans containing ``'Reported by'`` and takes the text
    between the last ``'€'`` and the first ``'R'``. If that yields an empty
    string, falls back to spans containing ``'· Prix. de'`` and takes the text
    after that marker and before the first ``'€'``.

    Returns:
        The extracted string, or ``''`` when neither pattern is present.
    """
    span_texts = [span.text for span in soup.find_all('span')]

    reported = ''.join(text for text in span_texts if 'Reported by' in text)
    price_range = reported.strip().split('R')[0].split('€')[-1]

    if price_range == '':
        fallback = ''.join(text for text in span_texts if '· Prix. de' in text)
        # Split on the same marker used by the filter above so the marker
        # itself is removed from the result.
        price_range = fallback.split('€')[0].split('· Prix. de')[-1].strip()

    return price_range


def get_mean_price_fute(soup):
    """Extract the mean-price text from a parsed Google result page.

    Joins the spans containing
    ``'(Les prix ont été fournis par le restaurant)'`` and returns the text
    before the first ``'€'`` and after ``'Price range: '``, stripped.

    Returns:
        The extracted string, or ``''`` when no such span exists.
    """
    marker = '(Les prix ont été fournis par le restaurant)'
    joined = ''.join(
        span.text for span in soup.find_all('span') if marker in span.text
    )
    return joined.split('€')[0].split('Price range: ')[-1].strip()


def google_scrap_fute(df):
    """Add ``Price Range`` and ``Mean Price`` columns scraped from Google.

    Args:
        df: DataFrame with ``Name`` and ``Address`` columns.

    Returns:
        The same DataFrame, modified in place. A row gets ``'NA'`` in both
        columns when its name/address is not a string (``AttributeError``) or
        when a ``ConnectionError`` occurs while scraping.
    """
    price_ranges = []
    mean_prices = []

    ic()
    for name, address in zip(df['Name'], df['Address']):
        ic()
        try:
            # Built before Chrome is launched so a non-string cell fails fast
            # without starting (and leaking) a browser.
            url = _build_search_url(name, address)
            ic(url)

            soup = _fetch_soup(url)

            price_range = get_price_range_fute(soup)
            ic(price_range)
            mean_price = get_mean_price_fute(soup)
            ic(mean_price)
        except (AttributeError, ConnectionError):
            # A tuple is required here: `except A or B` only ever catches A.
            ic()
            price_range = mean_price = 'NA'

        price_ranges.append(price_range)
        mean_prices.append(mean_price)

    df['Price Range'] = price_ranges
    df['Mean Price'] = mean_prices

    return df