prix_restos/google_scraping.py

102 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from icecream import ic
# Google scraping for restaurants in Le Fooding
def google_scrap_fooding(df):
    """Search Google for each Le Fooding restaurant and add a 'Price Range' column.

    For every (Name, Address) pair in ``df`` a Chrome instance is started,
    the Google results page is loaded, and a price-range string is pulled
    out of the text of the page's <span> elements.

    Args:
        df: table with 'Name' and 'Address' columns that supports column
            assignment (the code uses it like a pandas DataFrame).

    Returns:
        The same ``df`` object with a new 'Price Range' column. Extracted
        strings of 8 or more characters are replaced by ''.
    """
    base_url = 'https://www.google.com/search?hl=en&q='
    price_ranges = []
    # Iterate the two columns in lockstep, as google_scrap_fute does.
    for name, address in zip(df['Name'], df['Address']):
        options = webdriver.ChromeOptions()
        # NOTE(review): the flag has three leading dashes; Chrome's documented
        # switch is "--headless". Left unchanged here - confirm whether this
        # argument actually has any effect.
        options.add_argument("---headless=True")
        driver = webdriver.Chrome(options=options)
        try:
            # The query is name and address with spaces turned into dashes,
            # concatenated without any separator between the two.
            url = base_url + str(name).replace(' ', '-') + str(address).replace(' ', '-')
            ic()
            ic(name)
            driver.get(url)
            time.sleep(0.1)
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even if loading the page failed,
            # so a failing row does not leave a Chrome process behind.
            driver.quit()
        soup = BeautifulSoup(page_source, features='html.parser')
        all_span = [_.text for _ in soup.find_all('span')]
        # NOTE(review): the three '' literals below appear empty in this copy
        # of the file; a non-ASCII character may have been lost from each.
        # As written, `'' in _` is always true and `.split('')` raises
        # ValueError (empty separator). Confirm against the original file.
        price_range = ''.join([_ for _ in all_span if '' in _ and '' in _]).split('...')[-1][1:].split('')[0]
        ic(price_range)
        # Keep only short results; presumably longer strings are not a price
        # range - TODO confirm the intent of the 8-character cutoff.
        if len(price_range) < 8:
            price_ranges.append(price_range)
        else:
            price_ranges.append('')
    df['Price Range'] = price_ranges
    return df
def get_price_range_fute(soup):
    """Extract a price-range string from a parsed Google results page.

    The text of every <span> containing 'Reported by' is tried first; if
    that produces an empty string, spans containing '· Prix. de' are used
    as a fallback.

    Args:
        soup: parsed page exposing ``find_all('span')`` whose items have a
            ``.text`` attribute (google_scrap_fute passes a BeautifulSoup).

    Returns:
        The extracted substring.
    """
    # Join the text of all 'Reported by' spans, keep what precedes the first
    # 'R', then take the last piece after the separator.
    # NOTE(review): the separator in `.split('')` appears empty in this copy
    # of the file; a non-ASCII character may have been lost. With a truly
    # empty separator str.split raises ValueError - confirm against the
    # original file.
    price_range = (''.join([_.text for _ in soup.find_all('span') if 'Reported by' in _.text]).
                   strip().split('R')[0].split('')[-1])
    if price_range == '':
        # Fallback: use spans that contain the '· Prix. de' marker.
        all_span = [_.text for _ in soup.find_all('span')]
        price_range = [_ for _ in all_span if '· Prix. de' in _]
        # Keep the text between the marker and the first separator.
        # NOTE(review): same apparently-empty separator literal as above.
        price_range = ''.join(price_range).split('')[0].split('· Prix. de')[-1].strip()
    return price_range
# Google scraping for restaurants in Petit Fute
def get_mean_price_fute(soup):
    """Extract the price text from spans carrying the restaurant-supplied notice.

    Args:
        soup: parsed page exposing ``find_all('span')`` whose items have a
            ``.text`` attribute (google_scrap_fute passes a BeautifulSoup).

    Returns:
        The stripped text found after 'Price range: ' and before the first
        separator, taken from the concatenated matching spans.
    """
    # Select spans containing the French notice, which translates to
    # "(Prices were supplied by the restaurant)".
    mean_price = [_.text for _ in soup.find_all('span') if '(Les prix ont été fournis par le restaurant)' in _.text]
    # NOTE(review): the separator in `.split('')` appears empty in this copy
    # of the file; a non-ASCII character may have been lost. With a truly
    # empty separator str.split raises ValueError - confirm against the
    # original file.
    # NOTE(review): the function is named "mean price" but parses the text
    # after 'Price range: ' - confirm which quantity is intended.
    mean_price = ''.join(mean_price).split('')[0].split('Price range: ')[-1].strip()
    return mean_price
def google_scrap_fute(df):
    """Search Google for each Petit Fute restaurant and add price columns.

    For every (Name, Address) pair in ``df`` a Chrome instance is started,
    the Google results page is loaded, and the parsed page is handed to
    ``get_price_range_fute`` and ``get_mean_price_fute``.

    Args:
        df: table with 'Name' and 'Address' columns that supports column
            assignment (the code uses it like a pandas DataFrame).

    Returns:
        The same ``df`` object with new 'Price Range' and 'Mean Price'
        columns. Rows whose lookup raised AttributeError or ConnectionError
        get 'NA' in both columns.
    """
    base_url = 'https://www.google.com/search?hl=en&q='
    price_ranges = []
    mean_prices = []
    ic()
    for name, address in zip(df['Name'], df['Address']):
        ic()
        ic(name)
        options = webdriver.ChromeOptions()
        # NOTE(review): the flag has three leading dashes; Chrome's documented
        # switch is "--headless". Left unchanged here - confirm whether this
        # argument actually has any effect.
        options.add_argument("---headless=True")
        driver = webdriver.Chrome(options=options)
        try:
            # `.replace` raises AttributeError when name/address is not a
            # string; that row is then recorded as 'NA' below.
            url = base_url + name.replace(' ', '-') + address.replace(' ', '-')
            driver.get(url)
            time.sleep(0.1)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, features='html.parser')
            price_range = get_price_range_fute(soup)
            ic(price_range)
            mean_price = get_mean_price_fute(soup)
            ic(mean_price)
            price_ranges.append(price_range)
            mean_prices.append(mean_price)
        # A tuple is required to catch both types: the previous
        # `AttributeError or ConnectionError` evaluated to AttributeError
        # alone, so ConnectionError was never handled.
        except (AttributeError, ConnectionError):
            ic()
            price_ranges.append('NA')
            mean_prices.append('NA')
        finally:
            # Always shut the browser down; previously a handled error left
            # the Chrome instance running for every failing row.
            driver.quit()
    df['Price Range'] = price_ranges
    df['Mean Price'] = mean_prices
    return df