Ajouter google_scraping.py

This commit is contained in:
Olivier MEYER 2024-06-27 11:06:25 +02:00
parent 244abd9c6a
commit 32ef2b7fb9

102
google_scraping.py Normal file
View File

@ -0,0 +1,102 @@
import time
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from icecream import ic
from selenium import webdriver
# Google scraping for restaurants in Le Fooding
def google_scrap_fooding(df, *, currency='€'):
    """Look up each restaurant on Google and record its price range.

    For every (Name, Address) pair a headless Chrome session loads the
    Google results page for "<name> <address>", and a short price string
    is extracted from the text of the page's <span> elements.

    Args:
        df: Table-like object with 'Name' and 'Address' columns (a pandas
            DataFrame in practice). It is modified in place.
        currency: Non-empty currency marker used to locate the price text.

    Returns:
        The same ``df`` object, with a 'Price Range' column holding one
        string per row ('' when no plausible price text was found).
    """
    base_url = 'https://www.google.com/search?hl=en&q='
    price_ranges = []
    for name, address in zip(df['Name'], df['Address']):
        # URL-encode the query and keep name and address as separate words;
        # raw concatenation glued them together and let characters such as
        # '&' or '#' truncate the query.
        url = base_url + quote_plus(f'{name} {address}')
        ic(url)

        options = webdriver.ChromeOptions()
        # Chrome switches take two dashes; with three the flag is not
        # recognised and a visible browser window is opened.
        options.add_argument('--headless=new')
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            time.sleep(0.1)
            page_source = driver.page_source
        finally:
            # Always release the browser, even when the page load fails.
            driver.quit()

        soup = BeautifulSoup(page_source, features='html.parser')
        all_span = [span.text for span in soup.find_all('span')]
        # NOTE(review): the span filter and the final separator are assumed
        # to be the currency symbol (an empty separator makes str.split
        # raise ValueError) - confirm against real result pages.
        joined = ''.join(text for text in all_span if currency in text)
        # Keep what follows the last '...', drop its first character
        # (presumably a leading space - confirm), and stop at the currency.
        price_range = joined.split('...')[-1][1:].split(currency)[0]
        ic(price_range)
        # Anything of 8 characters or more is treated as a parsing miss.
        price_ranges.append(price_range if len(price_range) < 8 else '')
    df['Price Range'] = price_ranges
    return df
def get_price_range_fute(soup, currency='€'):
    """Extract the price-range text from a parsed Google results page.

    Two layouts are tried in order:

    1. <span> elements whose text contains 'Reported by': the text between
       the last currency symbol and the 'Reported by' marker is returned.
    2. Otherwise, <span> elements whose text contains '· Prix. de': the
       text between that marker and the first currency symbol is returned.

    Args:
        soup: Parsed page exposing ``find_all('span')`` whose items have a
            ``.text`` attribute (a BeautifulSoup document in this module).
        currency: Non-empty currency marker delimiting the price text.
            NOTE(review): '€' is assumed from the surrounding price
            strings - confirm against real result pages.

    Returns:
        The stripped price text, or '' when neither layout is present.
    """
    span_texts = [span.text for span in soup.find_all('span')]

    reported = ''.join(text for text in span_texts if 'Reported by' in text).strip()
    # Cut at the full marker rather than at the first capital 'R', which
    # also matched ordinary words such as "Restaurant" or "Ramen".
    before_marker = reported.partition('Reported by')[0]
    price_range = before_marker.split(currency)[-1].strip()
    if price_range:
        return price_range

    # Fallback layout: "... · Prix. de <low> à <high> <currency>".
    priced = ''.join(text for text in span_texts if '· Prix. de' in text)
    return priced.split(currency)[0].split('· Prix. de')[-1].strip()
# Google scraping for restaurants in Petit Fute
def get_mean_price_fute(soup, currency='€'):
    """Extract the restaurant-supplied price from a parsed Google page.

    Only <span> elements whose text contains
    '(Les prix ont été fournis par le restaurant)' are considered. From
    their concatenated text, the part after 'Price range: ' and before the
    first currency symbol is returned.

    Args:
        soup: Parsed page exposing ``find_all('span')`` whose items have a
            ``.text`` attribute (a BeautifulSoup document in this module).
        currency: Non-empty currency marker that ends the price text.
            NOTE(review): '€' is assumed from the surrounding price
            strings - confirm against real result pages.

    Returns:
        The stripped price text, or '' when no matching span exists.
    """
    marker = '(Les prix ont été fournis par le restaurant)'
    matching = ''.join(
        span.text for span in soup.find_all('span') if marker in span.text
    )
    return matching.split(currency)[0].split('Price range: ')[-1].strip()
def google_scrap_fute(df):
    """Look up each restaurant on Google and record its price information.

    For every (Name, Address) pair a headless Chrome session loads the
    Google results page for "<name> <address>"; the page is parsed and
    passed to ``get_price_range_fute`` and ``get_mean_price_fute``.

    Rows whose name or address is not a string (e.g. NaN), and rows for
    which an AttributeError or ConnectionError occurs, get 'NA' in both
    columns.

    Args:
        df: Table-like object with 'Name' and 'Address' columns (a pandas
            DataFrame in practice). It is modified in place.

    Returns:
        The same ``df`` object, with 'Price Range' and 'Mean Price'
        columns holding one string per row.
    """
    base_url = 'https://www.google.com/search?hl=en&q='
    price_ranges = []
    mean_prices = []
    for name, address in zip(df['Name'], df['Address']):
        if not (isinstance(name, str) and isinstance(address, str)):
            # These rows used to fail with AttributeError on .replace()
            # after a browser had already been started (and never quit);
            # record the same 'NA' result without launching Chrome.
            price_ranges.append('NA')
            mean_prices.append('NA')
            continue

        # URL-encode the query and keep name and address as separate words.
        url = base_url + quote_plus(f'{name} {address}')
        ic(url)

        options = webdriver.ChromeOptions()
        # Chrome switches take two dashes; with three the flag is not
        # recognised and a visible browser window is opened.
        options.add_argument('--headless=new')
        driver = webdriver.Chrome(options=options)
        try:
            try:
                driver.get(url)
                time.sleep(0.1)
                page_source = driver.page_source
            finally:
                # Always release the browser, even when the load fails.
                driver.quit()
            soup = BeautifulSoup(page_source, features='html.parser')
            price_range = get_price_range_fute(soup)
            ic(price_range)
            mean_price = get_mean_price_fute(soup)
            ic(mean_price)
        except (AttributeError, ConnectionError) as exc:
            # A tuple is required here: `except A or B` only catches A.
            ic(exc)
            price_range = mean_price = 'NA'
        price_ranges.append(price_range)
        mean_prices.append(mean_price)
    df['Price Range'] = price_ranges
    df['Mean Price'] = mean_prices
    return df