"""Scraper for restodeparis.com.

Collects restaurant names from the site's listing pages, visits each
restaurant page to extract its address and 'La Carte' menu prices, returns
everything as a pandas DataFrame, and archives each page's raw HTML under a
dated directory on disk.

Requires: requests, bs4 (BeautifulSoup), pandas, icecream.
"""
from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic

# Listing pages to harvest restaurant names from.
# (The original list contained 'coup-de-coeur' twice; the duplicate only
# triggered a redundant HTTP request, since names are de-duplicated later.)
URLS = [
    'https://restodeparis.com/coup-de-coeur/',
    'https://restodeparis.com/selection-michelin/',
    'https://restodeparis.com/restaurant/gastronomique/',
    'https://restodeparis.com/restaurant/bistronomique/',
    'https://restodeparis.com/restaurant/francais/',
    'https://restodeparis.com/restaurant/cuisine-du-monde/',
]


def get_names(soup):
    """Extract restaurant names from a listing page.

    Keeps only <a> tags whose markup carries 'target="_self" title=', then
    picks every other tag among the last 11 (indices -11, -9, ..., -1).
    NOTE(review): the -11/step-2 slice presumably skips duplicate anchors in
    the page footer — confirm against the live markup if the site changes.
    """
    anchors = [str(a) for a in soup.find_all('a')
               if str(a).find('target="_self" title=') != -1]
    picked = [anchors[i] for i in range(-11, 0, 2)]
    # '<a ...>Name</a>' -> drop the trailing '</a>' and keep the inner text.
    return [tag[:-4].split('>')[-1] for tag in picked]


def get_all_names(urls):
    """Scrape every listing URL; return sorted, de-duplicated name slugs."""
    names = []
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        names += get_names(soup)
    # Slugify: spaces -> dashes, strip apostrophes, fold the accented
    # characters seen on the site, then lower-case. '---' appears when a
    # name contains ' - ' and is collapsed back to a single dash.
    replacements = [
        (' ', '-'), ('\u2019', ''), ("'", ''), ('à', 'a'), ('â', 'a'),
        ('ắ', 'a'), ('é', 'e'), ('è', 'e'), ('î', 'i'), ('ï', 'i'),
        ('ö', 'o'), ('œ', 'oe'), ('ü', 'u'), ('ç', 'c'), ('---', '-'),
    ]
    slugs = []
    for name in names:
        for old, new in replacements:
            name = name.replace(old, new)
        slugs.append(name.lower())
    return sorted(set(slugs))


def get_restaurants_url(names):
    """Map each name slug to its restaurant page URL.

    'cafe-sud' has no page of its own and is pointed at the
    'la-regalade-saint-honore' page (quirk kept from the original data).
    """
    base = 'https://restodeparis.com/restaurant/'
    return [
        base + ('la-regalade-saint-honore' if name == 'cafe-sud' else name) + '/'
        for name in names
    ]


def get_address(soup):
    """Return the restaurant's street address with commas stripped ('' if absent).

    The address is the <span> immediately following the 'Y Aller !' button;
    the slice [1:-4] trims the leading newline and trailing markup residue
    observed on the site. Guarded against the marker being the last span
    (the original could raise IndexError there).
    """
    spans = [s.text for s in soup.find_all('span')]
    address = ''
    for i, text in enumerate(spans):
        if text.find('Y Aller !') != -1 and i + 1 < len(spans):
            address = spans[i + 1][1:-4]
    return address.replace(',', '')


def get_menu(soup):
    """Parse the 'La Carte' section of a restaurant page into {item: price}.

    After boilerplate is sliced away the page's spans alternate item/price.
    NOTE(review): the trailing-23-spans cut is a magic number inherited from
    the page template — re-check it if the site layout changes.
    """
    spans = [str(s.text) for s in soup.find_all('span')]
    spans = spans[:-23]
    # Drop empty entries. (The original removed items while iterating the
    # same list, which silently skipped consecutive blanks — fixed here.)
    spans = [s for s in spans if len(s) != 0 and s != '\n\n']
    cleaned = []
    for s in spans:
        # Whitespace-wrapped entries look like '\nText\t...'; unwrap them.
        if s[0] == '\n':
            cleaned.append(s[1:s.find('\t')])
        else:
            cleaned.append(s)
    # Keep only what follows the last 'La Carte' heading ...
    start = 0
    for i, s in enumerate(cleaned):
        if s == 'La Carte':
            start = i + 1
    cleaned = cleaned[start:]
    # ... and cut everything from the trailing 'Le Restaurant' section on.
    # (Break after truncating: the original kept indexing with bounds from
    # the pre-truncation length, a latent IndexError.)
    for i in range(1, len(cleaned) - 1):
        if cleaned[-i] == 'Le Restaurant':
            cleaned = cleaned[:-i]
            break
    # Truncate after the last entry that actually contains a price.
    last_price = 0
    for i, s in enumerate(cleaned):
        if '€' in s:
            last_price = i + 1
    if last_price != 0:
        cleaned = cleaned[:last_price]
    # French decimal convention: '12.50' -> '12,50'.
    cleaned = [s.replace('.', ',') for s in cleaned]
    # Pair item/price; the len-1 bound ignores a dangling unpaired item
    # (the original raised IndexError on odd-length lists).
    menu = {}
    for i in range(0, len(cleaned) - 1, 2):
        menu[cleaned[i]] = cleaned[i + 1]
    return menu


def complete_scraping():
    """Scrape every restaurant; return a DataFrame of one row per menu item.

    Columns: Name, Date, Address, Item, Price. Side effects: one HTTP GET
    per listing and restaurant page, and each restaurant's prettified HTML
    is archived under a per-day directory on disk.
    """
    names = get_all_names(URLS)
    restaurants_url = get_restaurants_url(names)
    # The original misspelled this first column 'Adress', which left a
    # spurious all-NaN 'Adress' column in the concatenated result — fixed.
    columns = ['Name', 'Date', 'Address', 'Item', 'Price']
    df = pd.DataFrame(columns=columns)
    today = datetime.date.today()
    base_dir = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/'
                'Resto_de_Paris/')
    for name, url in zip(names, restaurants_url):
        temporary_df = pd.DataFrame(columns=columns)
        ic()
        ic(name)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        menu = get_menu(soup)  # parse once (the original parsed twice)
        temporary_df['Item'] = menu.keys()
        temporary_df['Price'] = menu.values()
        address = get_address(soup)
        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Date'] = today
            temporary_df.loc[i, 'Address'] = address
        df = pd.concat([df, temporary_df], ignore_index=True)
        # Archive the raw HTML under a per-day directory.
        newpath = base_dir + str(today)
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        html_path = newpath + '/' + name.replace('/', '-') + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            # Single write (the original looped over prettify() per character).
            html_file.write(soup.prettify())
    df = df.dropna(subset=['Address'])
    return df