from selenium import webdriver
import time
from bs4 import BeautifulSoup
import re
import datetime
import os
import pandas as pd
from icecream import ic


def get_soup(url):
    # Fetch a page with headless Chrome and return it parsed by BeautifulSoup.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    time.sleep(1)  # crude wait for the page to finish rendering
    page_source = driver.page_source
    driver.quit()
    return BeautifulSoup(page_source, features='html.parser')


def get_names(soup):
    # Restaurant names are the <h4> headings on the listing page.
    return [_.text for _ in soup.find_all('h4')]


def get_addresses(soup):
    # Skip the first nine <li> items (site navigation); addresses follow.
    addresses = [_.text for _ in soup.find_all('li')][9:]
    # Each address block is introduced by a blank or whitespace-only item;
    # grab the two items that follow it.
    addresses_v2 = [addresses[_ + 1:_ + 3] for _ in range(len(addresses))
                    if addresses[_] == ' ' or addresses[_] == ''][:-1]
    addresses_v3 = []
    for addresse in addresses_v2:
        # Keep the candidate that starts with a street number but does not
        # end with a digit; otherwise mark the address as missing.
        if addresse[0].strip()[0].isdigit() and not addresse[0].strip()[-1].isdigit():
            addresses_v3.append(addresse[0].strip())
        elif addresse[-1].strip()[0].isdigit() and not addresse[-1].strip()[-1].isdigit():
            addresses_v3.append(addresse[-1].strip())
        else:
            addresses_v3.append('NA')
    return addresses_v3


def get_urls(soup):
    # Find every link to a restaurant page by its href prefix, then rebuild
    # the URL from the 120 characters that follow each match.
    urls_indexes = [_.start() for _ in
                    re.finditer('href="https://bestrestaurantsparis.com/fr/r', str(soup))]
    return ['/'.join(str(soup)[_:_ + 120][6:].split('/')[:-1]) + '/' for _ in urls_indexes]


def get_prices(url):
    soup = get_soup(url)
    # Every menu price ends with a euro sign; the last two matches on the
    # page are not menu items, so drop them.
    prices_indexes = [_.start() for _ in re.finditer('€', str(soup))][:-2]
    prices = []
    for index in prices_indexes:
        # Walk back 150 characters from the euro sign to recover the
        # "label ... item price" fragment it belongs to.
        try:
            prices.append(' '.join(
                str(soup)[index - 150:index - 1].split('label')[-1][2:].split('\n')
            ).split('<')[0])
        except IndexError:
            pass
    # Map "item name" -> "price €" (the price is the last token).
    prices_v2 = {}
    for price in prices:
        prices_v2[' '.join(price.split(' ')[:-1])] = price.split(' ')[-1] + ' €'
    return prices_v2, soup


def get_number_of_pages(soup):
    # The pagination bar is a list of numbered <li> items; the last numeric
    # one is the total page count.
    return int([_.text for _ in soup.find_all('li') if _.text.isdigit()][-1])


def scrap(soup):
    names = get_names(soup)
    addresses = get_addresses(soup)
    urls = get_urls(soup)
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
    for name, address, url in zip(names, addresses, urls):
        ic()
        ic(name)
        # ic(url)
        # Fetch each restaurant page once and reuse both the menu and the
        # soup, instead of launching two browsers per restaurant.
        menu, restaurant_soup = get_prices(url)
        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
        temporary_df['Item'] = list(menu.keys())
        temporary_df['Price'] = list(menu.values())
        temporary_df['Name'] = name
        temporary_df['Address'] = address
        temporary_df['Date'] = datetime.date.today()
        # Archive the raw HTML of the restaurant page, one folder per day.
        newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/'
                   + str(datetime.date.today()))
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        html_path = newpath + '/' + name.replace('/', '-') + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(restaurant_soup.prettify())
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df


def complete_scraping():
    ic()
    url = 'https://bestrestaurantsparis.com/fr/explore/'
    soup = get_soup(url)
    number_of_pages = get_number_of_pages(soup)
    df = scrap(soup)
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = url + '?pg=' + str(i) + '&sort=latest'
        ic(new_url)
        new_soup = get_soup(new_url)
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
    # Reset the index after dropping rows so positional and label indices
    # coincide for the clean-up pass below.
    df = df.dropna(subset=['Address']).reset_index(drop=True)
    # Rows whose price captured an href are parsing artifacts; drop each of
    # them together with the row that follows. Collect all labels first so
    # no label is dropped twice.
    bad_rows = {j for i, price in enumerate(df['Price']) if 'href' in str(price)
                for j in (i, i + 1) if j < len(df)}
    if bad_rows:
        ic(sorted(bad_rows))
        df = df.drop(index=sorted(bad_rows)).reset_index(drop=True)
    return df
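

# A minimal entry point sketch, assuming the script is run directly. The
# CSV filename below is illustrative and not part of the original scraper.
if __name__ == '__main__':
    results = complete_scraping()
    ic(len(results))
    results.to_csv('best_restaurants_' + str(datetime.date.today()) + '.csv',
                   index=False)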