from selenium import webdriver
from bs4 import BeautifulSoup
import time
from icecream import ic


# Google scraping for restaurants in Le Fooding
def google_scrap_fooding(df):
    price_ranges = []

    for i in range(len(df['Name'])):
        name = df.iloc[i]['Name']
        address = df.iloc[i]['Address']

        # A fresh headless Chrome instance per restaurant
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        base_url = 'https://www.google.com/search?hl=en&q='

        # Build a Google search query from the restaurant name and address
        url = base_url + str(name).replace(' ', '-') + str(address).replace(' ', '-')
        ic()
        ic(url)

        driver.get(url)
        time.sleep(0.1)
        page_source = driver.page_source
        driver.quit()
        soup = BeautifulSoup(page_source, features='html.parser')

        # Collect all <span> texts and keep the one shaped like a '€10–20' price range
        all_span = [_.text for _ in soup.find_all('span')]

        price_range = ''.join([_ for _ in all_span if '€' in _ and '–' in _]).split('...')[-1][1:].split('€')[0]
        ic(price_range)
        # Keep only short, plausible matches; anything longer is treated as noise
        if len(price_range) < 8:
            price_ranges.append(price_range)
        else:
            price_ranges.append('')

    df['Price Range'] = price_ranges

    return df
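

# Example usage of google_scrap_fooding (a minimal sketch; the DataFrame below is
# hypothetical and only assumes the 'Name' and 'Address' columns the function reads):
#
#     import pandas as pd
#     fooding_df = pd.DataFrame({
#         'Name': ['Le Servan'],
#         'Address': ['32 Rue Saint-Maur, 75011 Paris'],
#     })
#     fooding_df = google_scrap_fooding(fooding_df)
#     print(fooding_df['Price Range'])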
# Google scraping for restaurants in Petit Fute
def get_price_range_fute(soup):
    # Price range taken from the span that mentions 'Reported by'
    price_range = (''.join([_.text for _ in soup.find_all('span') if 'Reported by' in _.text]).
                   strip().split('R')[0].split('€')[-1])

    # Fall back to the French '· Prix. de' span when nothing was found
    if price_range == '':
        all_span = [_.text for _ in soup.find_all('span')]
        price_range = [_ for _ in all_span if '· Prix. de' in _]
        price_range = ''.join(price_range).split('€')[0].split('· Prix. de')[-1].strip()

    return price_range


def get_mean_price_fute(soup):
    # Mean price taken from the span carrying the Petit Fute price disclaimer
    mean_price = [_.text for _ in soup.find_all('span') if '(Les prix ont été fournis par le restaurant)' in _.text]
    mean_price = ''.join(mean_price).split('€')[0].split('Price range: ')[-1].strip()
    return mean_price
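

# Minimal sketch of what the two Petit Fute helpers parse, using hypothetical
# span texts (the HTML below is an assumption, not captured Google output):
#
#     html = ('<span>€10–20 Reported by 100+ people</span>'
#             '<span>Price range: 25 € (Les prix ont été fournis par le restaurant)</span>')
#     soup = BeautifulSoup(html, features='html.parser')
#     get_price_range_fute(soup)   # -> '10–20 '
#     get_mean_price_fute(soup)    # -> '25'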
def google_scrap_fute(df):
    price_ranges = []
    mean_prices = []

    ic()
    for name, address in zip(df['Name'], df['Address']):
        ic()
        # A fresh headless Chrome instance per restaurant
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        base_url = 'https://www.google.com/search?hl=en&q='
        try:
            # Build a Google search query from the restaurant name and address
            url = base_url + name.replace(' ', '-') + address.replace(' ', '-')
            ic(url)

            driver.get(url)
            time.sleep(0.1)

            page_source = driver.page_source
            driver.quit()
            soup = BeautifulSoup(page_source, features='html.parser')

            price_range = get_price_range_fute(soup)
            ic(price_range)
            mean_price = get_mean_price_fute(soup)
            ic(mean_price)
            price_ranges.append(price_range)
            mean_prices.append(mean_price)
        except (AttributeError, ConnectionError):
            # Mark rows that could not be scraped
            ic()
            price_ranges.append('NA')
            mean_prices.append('NA')

    df['Price Range'] = price_ranges
    df['Mean Price'] = mean_prices

    return df
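

# Example usage of google_scrap_fute (a minimal sketch; the DataFrame below is
# hypothetical and only assumes the 'Name' and 'Address' columns the function reads):
#
#     import pandas as pd
#     fute_df = pd.DataFrame({
#         'Name': ['Chez Georges'],
#         'Address': ['1 Rue du Mail, 75002 Paris'],
#     })
#     fute_df = google_scrap_fute(fute_df)
#     print(fute_df[['Price Range', 'Mean Price']])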