from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic


# Number of pages
def get_number_of_pages(soup):
    """Return the highest page number found in the pagination links."""
    number_of_pages = []
    for tag in soup.find_all():
        html = str(tag)
        x = html.find('paris-8246?page=')
        if x != -1:
            # Up to two digits follow 'page='; keep only the numeric part.
            chunk = html[x + len('paris-8246?page='):][:2]
            digits = ''.join(c for c in chunk if c.isdigit())
            if digits:
                number_of_pages.append(int(digits))
    ic(max(number_of_pages))
    return max(number_of_pages)


# Restaurant names
def get_names(soup):
    """Extract the restaurant names from the last 12 <h2> tags of a page."""
    # The slice [41:-5] strips the opening <h2 ...> markup and the closing
    # </h2>. The original comprehension also had a filter whose HTML string
    # literal was lost; junk entries are dropped later in complete_scraping().
    return [str(h2)[41:-5] for h2 in soup.find_all('h2')[-12:]]


# Restaurant price
# NOTE: the original body of this function was lost. This is a minimal
# reconstruction that returns the text just before the first euro sign.
def get_price(page):
    text = BeautifulSoup(page.text, features='html.parser').get_text()
    x = text.find('€')
    if x == -1:
        return ''
    return text[max(0, x - 10):x + 1].strip()


# Restaurant address
def get_adress(soup):
    """Extract the address from the 100 characters before its end marker."""
    # Placeholder: the original HTML marker literal (a tag that follows the
    # address on the page) was lost during extraction.
    marker = '</p>'
    html = str(soup)
    x = html.find(marker)
    if x == -1:
        return ''
    address = html[x - 100:x]
    try:
        # Keep only the text that follows the first closing bracket.
        address = address.split('>')[1]
    except IndexError:
        pass
    address = address.lstrip()[:-13]
    # Keep everything before 'France' and trim the trailing ', '.
    address = ''.join(address.split('France')[:-1])[:-2]
    return address.replace(',', '')


# Prices and addresses of a complete page
def get_prices_and_addresses(names):
    prices = []
    adresses = []
    for name in names:
        ic(name)
        # Reset per restaurant so a failed fetch cannot reuse stale data.
        adress = ''
        soup = ''
        if not name.isascii():
            # Accented names cannot be turned into a URL slug here.
            ic()
            prices.append('Not ASCII')
            adresses.append('')  # keep both lists aligned with names
        else:
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            new_page = requests.get(new_url)
            x = 0
            # str() of a requests Response is '<Response [status]>'.
            match str(new_page):
                case '<Response [200]>':
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    adress = get_adress(soup)
                case '<Response [404]>':
                    # Fall back to the 'restaurant-<name>-paris' URL scheme.
                    ic()
                    new_url = ('https://lefooding.com/restaurants/restaurant-'
                               + name.lower() + '-paris')
                    new_page = requests.get(new_url)
                    match str(new_page):
                        case '<Response [200]>':
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            adress = get_adress(soup)
                        case '<Response [404]>':
                            # Last resort: try numbered suffixes on the slug.
                            ic()
                            x = ''
                            for i in range(1, 21):
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if str(new_page) == '<Response [200]>':
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    adress = get_adress(soup)
                                    break
            prices.append(x)
            adresses.append(adress)
        # Save the fetched page for later inspection.
        if soup != '':
            newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/'
                       'Le Fooding/' + str(datetime.date.today()))
            if not os.path.exists(newpath):
                os.makedirs(newpath)
            html_path = newpath + '/' + name.replace('/', '-') + '.html'
            with open(html_path, 'wt', encoding='utf-8') as html_file:
                html_file.write(soup.prettify())
    return prices, adresses


# Scrap function
def scrap_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)
    df = pd.DataFrame(list(zip(names, addresses, prices)),
                      columns=['Name', 'Address', 'Price'])
    df['Date'] = datetime.date.today()
    return df


# Main function
def complete_scraping():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)
    temporary_df = scrap_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)

    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrap_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)

    # Drop hidden/junk entries picked up by get_names, then rows without an address.
    df = df[~df['Name'].str.contains('style="display')]
    df = df.dropna(subset=['Address'])
    return df
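

# Usage sketch (not part of the original script): run the full scrape and save
# the result. The CSV filename below is an assumption for illustration.
if __name__ == '__main__':
    df = complete_scraping()
    df.to_csv('lefooding_paris_' + str(datetime.date.today()) + '.csv',
              index=False)
    ic(len(df))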