# prix_restos/best_restaurants_scraping.py

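"""Scrape bestrestaurantsparis.com: collect restaurant names, addresses, and
menu prices into a pandas DataFrame, archiving the fetched HTML pages locally."""
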
import datetime
import os
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from icecream import ic
from selenium import webdriver


def get_names(soup):
    # Restaurant names appear as <h4> headings on the listing page.
    return [_.text for _ in soup.find_all('h4')]


def get_addresses(soup):
    # Address fragments live in <li> tags; the first nine are navigation items.
    addresses = [_.text for _ in soup.find_all('li')][9:]
    # Blank items separate listings: keep the two lines following each blank.
    addresses_v2 = [addresses[_ + 1:_ + 3] for _ in range(len(addresses))
                    if addresses[_] == ' ' or addresses[_] == ''][:-1]
    addresses_v3 = []
    for addresse in addresses_v2:
        # A street address starts with a digit (the street number) but does not
        # end with one, which would indicate a phone number instead.
        if addresse[0].strip()[0].isdigit() and not addresse[0].strip()[-1].isdigit():
            addresses_v3.append(addresse[0].strip())
        elif addresse[-1].strip()[0].isdigit() and not addresse[-1].strip()[-1].isdigit():
            addresses_v3.append(addresse[-1].strip())
        else:
            addresses_v3.append('NA')
    return addresses_v3
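
# Illustration of the heuristic above (hypothetical data): a pair like
# ['15 Rue des Archives', 'Paris 4e'] yields '15 Rue des Archives', while a
# pair such as ['Paris 4e', '+33 1 42 72 00 00'] matches neither branch and
# yields 'NA'.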


def get_urls(soup):
    # Locate every link to a restaurant detail page and rebuild its base URL.
    urls_indexes = [_.start() for _ in
                    re.finditer('href="https://bestrestaurantsparis.com/fr/r', str(soup))]
    return ['/'.join(str(soup)[_:_ + 120][6:].split('/')[:-1]) + '/' for _ in urls_indexes]


def get_prices(url):
    # Reuse get_soup (defined below) to fetch and parse the restaurant page.
    soup = get_soup(url)
    # NOTE: the search pattern below appears truncated; an empty pattern
    # matches at every index of the document.
    prices_indexes = [_.start() for _ in re.finditer('', str(soup))][:-2]
    # Recover "item price" pairs from the 150 characters preceding each marker.
    prices = [' '.join(str(soup)[_ - 150:_ - 1].split('label')[-1][2:]
                       .split('</div><div class="menu-price">'))
              for _ in prices_indexes]
    try:
        # The first entry is sometimes the average price, marked up differently.
        if prices[0][:10] == 'Prix moyen':
            prices[0] = ' '.join(prices[0].split('</div><div class="value">')).split('<')[0]
    except IndexError:
        pass
    # Map each item name to its price (the last whitespace-separated token).
    prices_v2 = {}
    for _ in prices:
        prices_v2[' '.join(_.split(' ')[:-1])] = _.split(' ')[-1]
    return prices_v2, soup
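
# A typical call (URL shape hypothetical, matching the links get_urls extracts):
#   prices, page_soup = get_prices('https://bestrestaurantsparis.com/fr/r/<slug>/')
#   # prices -> {'Prix moyen': '35', 'Entrecôte frites': '28', ...} (illustrative)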


def get_number_of_pages(soup):
    # The pagination bar renders page numbers as numeric <li> entries; the last is the total.
    return int([_.text for _ in soup.find_all('li') if _.text.isdigit()][-1])


def scrap(soup):
    names = get_names(soup)
    addresses = get_addresses(soup)
    urls = get_urls(soup)
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
    for name, address, url in zip(names, addresses, urls):
        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
        ic()
        ic(name)
        # ic(url)
        # Fetch each detail page once; get_prices drives its own browser session.
        prices, page_soup = get_prices(url)
        temporary_df['Item'] = list(prices.keys())
        temporary_df['Price'] = list(prices.values())
        # Archive the restaurant page's HTML under a per-day directory.
        newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/'
                   + str(datetime.date.today()))
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        html_path = newpath + '/' + name.replace('/', '-') + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(page_soup.prettify())
        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Address'] = address
            temporary_df.loc[i, 'Date'] = datetime.date.today()
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df


def get_soup(url):
    # Fetch a page with headless Chrome and parse it with BeautifulSoup.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(1)  # give the page a moment to render
    page_source = driver.page_source
    driver.quit()
    return BeautifulSoup(page_source, features='html.parser')


def complete_scraping():
    ic()
    url = 'https://bestrestaurantsparis.com/fr/explore/'
    soup = get_soup(url)
    number_of_pages = get_number_of_pages(soup)
    df = scrap(soup)
    # Pages 2..n are reachable through the ?pg= query parameter.
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = url + '?pg=' + str(i) + '&sort=latest'
        ic(new_url)
        new_soup = get_soup(new_url)
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
    # Drop rows without an address, then renumber so index lookups stay valid.
    df = df.dropna(subset=['Address']).reset_index(drop=True)
    # An 'href' in the Price column means raw HTML leaked into the parse;
    # discard that row and the one following it.
    bad_rows = [i for i in df.index if 'href' in str(df.loc[i, 'Price'])]
    if bad_rows:
        ic(bad_rows)
    to_drop = sorted({j for i in bad_rows for j in (i, i + 1) if j in df.index})
    df = df.drop(index=to_drop)
    return df
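

# Minimal entry point: a sketch that runs the full scrape and saves the result,
# assuming a CSV output filename of our choosing (not specified by the module).
if __name__ == '__main__':
    results = complete_scraping()
    ic(len(results))
    results.to_csv('best_restaurants_' + str(datetime.date.today()) + '.csv', index=False)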