diff --git a/best_restaurants_scraping_v2.py b/best_restaurants_scraping_v2.py
new file mode 100644
index 0000000..1393cb0
--- /dev/null
+++ b/best_restaurants_scraping_v2.py
@@ -0,0 +1,149 @@
+from selenium import webdriver
+import time
+from bs4 import BeautifulSoup
+import re
+import datetime
+import os
+import pandas as pd
+from icecream import ic
+
+
+def get_names(soup):
+    # Restaurant names are the only <h4> elements on the listing page.
+    return [_.text for _ in soup.find_all('h4')]
+
+
+def get_addresses(soup):
+    # The first nine <li> elements belong to the page chrome, not the listings.
+    addresses = [_.text for _ in soup.find_all('li')][9:]
+
+    # Each listing is preceded by a blank <li>; keep the two entries after it.
+    addresses_v2 = [addresses[_+1:_+3] for _ in range(len(addresses)) if
+                    addresses[_] == ' ' or addresses[_] == ''][:-1]
+
+    # A street address starts with a digit (the street number) but does not
+    # end with one; anything else falls back to 'NA'.
+    addresses_v3 = []
+    for addresse in addresses_v2:
+        if addresse[0].strip()[0].isdigit() and not addresse[0].strip()[-1].isdigit():
+            addresses_v3.append(addresse[0].strip())
+        elif addresse[-1].strip()[0].isdigit() and not addresse[-1].strip()[-1].isdigit():
+            addresses_v3.append(addresse[-1].strip())
+        else:
+            addresses_v3.append('NA')
+
+    return addresses_v3
+
+
+def get_urls(soup):
+    # Every link to a restaurant detail page starts with the same prefix;
+    # cut each matched URL back to its base path.
+    urls_indexes = [_.start() for _ in re.finditer('href="https://bestrestaurantsparis.com/fr/r', str(soup))]
+    return ['/'.join(str(soup)[_:_+120][6:].split('/')[:-1]) + '/' for _ in urls_indexes]
+
+
+def get_prices(url):
+    options = webdriver.ChromeOptions()
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+
+    driver.get(url)
+    time.sleep(1)
+    page_source = driver.page_source
+    driver.quit()
+    soup = BeautifulSoup(page_source, features='html.parser')
+
+    # Every menu item ends with a '€'; the last two occurrences are boilerplate.
+    prices_indexes = [_.start() for _ in re.finditer('€', str(soup))][:-2]
+
+    # Walk back 150 characters from each '€' to recover "<item> <amount>".
+    prices = []
+    for index in prices_indexes:
+        try:
+            prices.append(' '.join(str(soup)[index - 150:index - 1]
+                                   .split('label')[-1][2:].split('\n')).split('<')[0])
+        except IndexError:
+            pass
+
+    # Map each item name to its price, e.g. {'Escargots': '12 €'}.
+    prices_v2 = {}
+    for _ in prices:
+        prices_v2[' '.join(_.split(' ')[:-1])] = _.split(' ')[-1] + ' €'
+    return prices_v2, soup
+
+
+def get_number_of_pages(soup):
+    # The pagination links are the only purely numeric <li> elements;
+    # the last one is the total page count.
+    return int([_.text for _ in soup.find_all('li') if _.text.isdigit()][-1])
+
+
+def scrap(soup):
+    names = get_names(soup)
+    addresses = get_addresses(soup)
+    urls = get_urls(soup)
+
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
+    for name, address, url in zip(names, addresses, urls):
+        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
+        ic()
+        ic(name)
+        # ic(url)
+        # Fetch the detail page once (each get_prices call opens a browser).
+        prices, restaurant_soup = get_prices(url)
+        temporary_df['Item'] = list(prices.keys())
+        temporary_df['Price'] = list(prices.values())
+
+        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' + str(datetime.date.today())
+        if not os.path.exists(newpath):
+            os.makedirs(newpath)
+
+        # Archive the restaurant page that was actually scraped.
+        html_path = os.path.join(newpath, name + '.html')
+        with open(html_path, 'wt', encoding='utf-8') as html_file:
+            html_file.write(restaurant_soup.prettify())
+
+        temporary_df['Name'] = name
+        temporary_df['Address'] = address
+        temporary_df['Date'] = datetime.date.today()
+
+        df = pd.concat([df, temporary_df], ignore_index=True)
+    return df
+
+
+def get_soup(url):
+    options = webdriver.ChromeOptions()
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+
+    driver.get(url)
+    time.sleep(1)
+
+    page_source = driver.page_source
+    driver.quit()
+    soup = BeautifulSoup(page_source, features='html.parser')
+
+    return soup
+
+
+def main():
+    url = 'https://bestrestaurantsparis.com/fr/explore/'
+    soup = get_soup(url)
+
+    number_of_pages = get_number_of_pages(soup)
+    # Scrape the first results page, then every remaining page.
+    df = scrap(soup)
+    for i in range(2, number_of_pages + 1):
+        ic(i)
+        new_url = url + '?pg=' + str(i) + '&sort=latest'
+        ic(new_url)
+        new_soup = get_soup(new_url)
+        temporary_df = scrap(new_soup)
+        df = pd.concat([df, temporary_df], ignore_index=True)
+    return df
+
+
+ic()
+df = main()
+df.to_csv('/Users/oliviermeyer/Desktop/best_restaurants_siret_test.csv', index=False, header=True, escapechar='\\')
+ic()
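
Note: the price extraction in get_prices() leans on fixed character offsets (150 characters back from each '€'), which breaks as soon as the markup shifts. A minimal structure-based sketch of the same technique, assuming each menu item sits in an element carrying a 'label' CSS class with text like "Escargots 12 €" (an assumption; the real markup is not shown in this diff):

    from bs4 import BeautifulSoup

    def parse_prices(page_source):
        # Hypothetical helper: walks the parse tree instead of raw string offsets.
        soup = BeautifulSoup(page_source, features='html.parser')
        prices = {}
        for node in soup.find_all(class_='label'):    # assumed class name
            text = ' '.join(node.get_text().split())  # normalise whitespace
            parts = text.rsplit(' ', 2)               # "Escargots 12 €"
            if len(parts) == 3 and parts[2] == '€' and parts[1][:1].isdigit():
                prices[parts[0]] = parts[1] + ' €'
        return prices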
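
Similarly, the fixed time.sleep(1) in get_soup() and get_prices() races the page load. A sketch of the same fetch using Selenium's explicit waits, assuming the <h4> restaurant names signal that the listing has rendered (an assumption about this site's markup):

    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    def get_soup_waiting(url):
        # Hypothetical variant of get_soup(): block until the content appears
        # (up to 10 seconds) instead of sleeping a fixed second.
        options = webdriver.ChromeOptions()
        options.add_argument("--headless=new")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, 'h4')))
            page_source = driver.page_source
        finally:
            driver.quit()
        return BeautifulSoup(page_source, features='html.parser')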