Ajouter google_scraping.py
This commit is contained in:
parent
244abd9c6a
commit
32ef2b7fb9
102
google_scraping.py
Normal file
102
google_scraping.py
Normal file
|
@ -0,0 +1,102 @@
|
|||
import time
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from icecream import ic
from selenium import webdriver
|
||||
|
||||
|
||||
# Google scraping for restaurants in Le Fooding
|
||||
def google_scrap_fooding(df):
    """Add a 'Price Range' column scraped from Google search result pages.

    For every row of *df*, a Google search for the restaurant's name and
    address is loaded in Chrome and a price range is extracted from the text
    of the ``<span>`` elements of the returned page.

    Args:
        df: Table indexable as ``df['Name']`` / ``df['Address']``. Values are
            converted with ``str()`` before being put in the query.

    Returns:
        The same *df* object with a new 'Price Range' column. A cell holds
        the extracted text, or '' when the extracted text is 8 characters or
        longer (treated as "no price range found").
    """
    base_url = 'https://www.google.com/search?hl=en&q='

    options = webdriver.ChromeOptions()
    # Chrome switches start with exactly two dashes; the previous
    # "---headless=True" was not recognised as the headless switch.
    options.add_argument("--headless=new")

    price_ranges = []

    # One browser session serves every row; try/finally guarantees the
    # Chrome process is terminated even if a page load or parse step raises.
    driver = webdriver.Chrome(options=options)
    try:
        for name, address in zip(df['Name'], df['Address']):
            # URL-encode the query so characters such as '&', '#' or accents
            # in a name/address cannot corrupt or truncate the search URL.
            url = base_url + quote_plus(f"{name} {address}")
            ic()
            ic(url)

            driver.get(url)
            time.sleep(0.1)

            soup = BeautifulSoup(driver.page_source, features='html.parser')
            span_texts = [span.text for span in soup.find_all('span')]

            # Concatenate every span mentioning both a euro sign and an en
            # dash, keep what follows the last '...', drop its first
            # character and cut at the next euro sign.
            candidates = ''.join(
                text for text in span_texts if '€' in text and '–' in text
            )
            price_range = candidates.split('...')[-1][1:].split('€')[0]
            ic(price_range)

            # Long leftovers are assumed not to be a price range.
            price_ranges.append(price_range if len(price_range) < 8 else '')
    finally:
        driver.quit()

    df['Price Range'] = price_ranges

    return df
|
||||
|
||||
|
||||
def get_price_range_fute(soup):
    """Extract a price range string from a parsed Google result page.

    Two patterns are tried, in order:

    1. Spans whose text contains 'Reported by': the text before that marker
       is kept, and within it whatever follows the last euro sign.
    2. If that yields nothing, spans containing '· Prix. de': the text
       between that marker and the first euro sign is kept.

    Args:
        soup: Parsed page exposing ``find_all('span')`` whose items have a
            ``.text`` attribute (e.g. a BeautifulSoup object).

    Returns:
        The extracted text with surrounding whitespace removed, or '' when
        neither pattern matches.
    """
    span_texts = [span.text for span in soup.find_all('span')]

    reported = ''.join(text for text in span_texts if 'Reported by' in text)
    # Cut at the full 'Reported by' marker: cutting at a bare 'R' truncated
    # the text at any earlier capital R (e.g. in "Restaurant").
    price_range = reported.split('Reported by')[0].split('€')[-1].strip()

    if price_range == '':
        prix = ''.join(text for text in span_texts if '· Prix. de' in text)
        price_range = prix.split('€')[0].split('· Prix. de')[-1].strip()

    return price_range
|
||||
|
||||
|
||||
# Google scraping for restaurants in Petit Fute
|
||||
def get_mean_price_fute(soup):
    """Extract the mean price text from a parsed Google result page.

    Only spans whose text contains the marker
    '(Les prix ont été fournis par le restaurant)' are considered. Their
    texts are concatenated; the part before the first euro sign is kept and,
    within it, whatever follows the last 'Price range: ' label.

    Args:
        soup: Parsed page exposing ``find_all('span')`` whose items have a
            ``.text`` attribute (e.g. a BeautifulSoup object).

    Returns:
        The extracted text with surrounding whitespace removed, or '' when
        no span carries the marker.
    """
    marker = '(Les prix ont été fournis par le restaurant)'
    joined = ''.join(
        span.text for span in soup.find_all('span') if marker in span.text
    )

    before_euro = joined.partition('€')[0]
    after_label = before_euro.rpartition('Price range: ')[2]
    return after_label.strip()
|
||||
|
||||
|
||||
def google_scrap_fute(df):
    """Add 'Price Range' and 'Mean Price' columns scraped from Google.

    For every row of *df*, a Google search for the restaurant's name and
    address is loaded in Chrome; the page is parsed and handed to
    ``get_price_range_fute`` and ``get_mean_price_fute``.

    Args:
        df: Table indexable as ``df['Name']`` / ``df['Address']``.

    Returns:
        The same *df* object with two new columns. A row gets 'NA' in both
        columns when its name or address is not a string (e.g. a missing
        value) or when an AttributeError / ConnectionError occurs while
        processing it.
    """
    base_url = 'https://www.google.com/search?hl=en&q='

    options = webdriver.ChromeOptions()
    # Chrome switches start with exactly two dashes; the previous
    # "---headless=True" was not recognised as the headless switch.
    options.add_argument("--headless=new")

    price_ranges = []
    mean_prices = []

    ic()
    # One browser session serves every row; try/finally guarantees the
    # Chrome process is terminated even if an uncaught error escapes.
    driver = webdriver.Chrome(options=options)
    try:
        for name, address in zip(df['Name'], df['Address']):
            ic()
            price_range = mean_price = 'NA'

            # Non-string cells used to fail on `.replace` and end up as 'NA'
            # through the AttributeError handler; keep that outcome
            # explicitly.
            if isinstance(name, str) and isinstance(address, str):
                try:
                    # URL-encode the query so characters such as '&', '#' or
                    # accents cannot corrupt or truncate the search URL.
                    url = base_url + quote_plus(f"{name} {address}")
                    ic(url)

                    driver.get(url)
                    time.sleep(0.1)

                    soup = BeautifulSoup(
                        driver.page_source, features='html.parser'
                    )
                    price_range = get_price_range_fute(soup)
                    mean_price = get_mean_price_fute(soup)
                # `except A or B` only ever caught A; a tuple catches both.
                except (AttributeError, ConnectionError):
                    ic()
                    price_range = mean_price = 'NA'

            ic(price_range)
            ic(mean_price)
            # Append to both lists together so they always stay aligned
            # with the rows of df.
            price_ranges.append(price_range)
            mean_prices.append(mean_price)
    finally:
        driver.quit()

    df['Price Range'] = price_ranges
    df['Mean Price'] = mean_prices

    return df
|
Loading…
Reference in New Issue
Block a user