from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic


def get_names(soup):
    # Restaurant names are the <h2> headings; drop the generic 'manger à'
    # heading and the trailing non-restaurant entry.
    return [_.text.strip() for _ in soup.find_all('h2') if 'manger à' not in _.text][:-1]


def get_number_of_pages(soup):
    # The page count sits at the end of the 14th <ul>
    # (fragile: depends on the current page layout).
    return int([_.text for _ in soup.find_all('ul')][13][-3:-1])


def get_prices(soup):
    # Price strings appear inside <div> blocks as '... • €XX ...'.
    prices = [_.text.strip() for _ in soup.find_all('div') if ' • €' in _.text]
    prices_v2 = []
    for price in prices:
        x = price.find(' • €')
        prices_v2.append(price[x + 3:x + 8].strip())
    # The same price is repeated across nested <div>s; keep every 6th entry.
    return [prices_v2[_] for _ in range(0, len(prices_v2[6:]) + 7, 6)][1:]


def get_url(soup):
    # Relative links ending in '.html' point to the restaurant detail pages.
    links = [str(_) for _ in soup.find_all('a') if '.html' in str(_) and 'https:' not in str(_)][1:-1]
    links_v2 = []
    for link in links:
        x = link.find('.html') + 5
        links_v2.append(link[:x].split('href')[-1][3:])
    # Build absolute URLs, skipping duplicates while preserving order.
    links_v3 = []
    for link in links_v2:
        if 'https://www.petitfute.com/' + link not in links_v3:
            links_v3.append('https://www.petitfute.com/' + link)
    return links_v3


def get_addresses(soup):
    urls = get_url(soup)
    addresses = []
    for url in urls:
        page = requests.get(url)
        soup2 = BeautifulSoup(page.text, features='html.parser')
        all_span = [_.text for _ in soup2.find_all('span')]
        # The two <span>s before 'Paris,' hold the street number and name;
        # collapse double spaces left over after stripping commas.
        try:
            address = ((all_span[all_span.index('Paris,') - 2] + all_span[all_span.index('Paris,') - 1])
                       .replace(',', ' ').replace('  ', ' '))
        except ValueError:
            address = ''
        # Deal with words before the street number.
        if not address.split(' ')[0].isdigit():
            address_piece_by_piece = address.split(' ')
            for i in range(len(address_piece_by_piece)):
                try:
                    if address_piece_by_piece[i][0].isdigit():
                        address = ' '.join(address_piece_by_piece[i:])
                        break
                except IndexError:
                    pass
        # Deal with double street numbers (keep only the first number).
        if '-' in address[:5]:
            address = address.split('-')[0] + ' ' + ' '.join(address.split(' ')[1:])
        # Deal with abbreviations.
        address = address.replace('Av.', 'avenue').replace('bis ', '')
        addresses.append(address)
    return addresses


def scrap(soup):
    ic()
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    df['Name'] = get_names(soup)
    df['Date'] = datetime.date.today()
    try:
        df['Address'] = get_addresses(soup)
    except ValueError:
        # Length mismatch between names and addresses: log both for debugging.
        ic(get_names(soup))
        ic(get_addresses(soup))
    try:
        df['Price'] = get_prices(soup)
    except ValueError:
        ic(get_names(soup))
        ic(get_prices(soup))
    # Archive the listing page's HTML once per restaurant, under a dated folder.
    for name in df['Name']:
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        try:
            html_path = newpath + '/' + name.replace('/', '-') + '.html'
            with open(html_path, 'wt', encoding='utf-8') as html_file:
                html_file.write(soup.prettify())
        except AttributeError:
            ic()
    return df


def complete_scraping():
    url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    df = scrap(soup)
    number_of_pages = get_number_of_pages(soup)
    # Note: depending on how the site indexes its pagination, '?page=0' or
    # '?page=1' may duplicate the base page scraped above.
    for i in range(number_of_pages + 1):
        new_url = url + '?page=' + str(i)
        new_page = requests.get(new_url)
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
    # Drop rows without an address; reset the index so positional access below is safe.
    df = df.dropna(subset=['Address']).reset_index(drop=True)
    # Keep only the numeric part of prices such as '20 €' (leave pure '€€'-style ratings as-is).
    for i in range(len(df)):
        price = df.at[i, 'Price']
        if not pd.isna(price) and price.replace('€', '') != '':
            df.at[i, 'Price'] = price.split(' ')[0]
    return df
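

# Minimal usage sketch, assuming the script is meant to be run directly.
# The CSV filename below is illustrative, not part of the original script.
if __name__ == '__main__':
    result = complete_scraping()
    result.to_csv('petit_fute_' + str(datetime.date.today()) + '.csv', index=False)
    print(result.head())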