from bs4 import BeautifulSoup
import requests
import datetime
import pandas as pd
from icecream import ic

BASE_URL = 'https://fr.gaultmillau.com'

# Day/time labels that appear in the opening-hours table; used to filter
# those cells out of the menu and address extraction.
TIMES = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi',
         'Dimanche', 'Matin', 'Midi', 'Soir']


def get_pages(soup):
    """Collect the relative URLs of the result pages from the pagination
    <option> tags, reading the 'value' attribute directly instead of
    string-splitting the tag's repr. Sorted so iteration order is
    deterministic (a plain set is unordered, which could make main()
    skip one page and scrape another twice)."""
    return sorted({opt['value'] for opt in soup.find_all('option', value=True)
                   if 'Page' in str(opt)})


def get_names(soup):
    # Restaurant names are the <h4> headings; the last two <h4> tags on the
    # page are not restaurants, hence the slice.
    return [h4.text.strip() for h4 in soup.find_all('h4')][:-2]


def get_links(soup):
    # Relative links to the restaurant detail pages, read from the href
    # attribute (robust against extra attributes in the tag).
    return [a['href'] for a in soup.find_all('a', href=True)
            if 'restaurants/' in a['href']]


def get_prices(detail_soup):
    """Parse the menu table of an already-fetched, already-parsed detail page
    into an {item: price} dict. Table cells come in (item, price) pairs;
    day/time labels from the opening-hours table are filtered out first."""
    menu = [td.text.strip() for td in detail_soup.find_all('td')
            if td.text.strip() not in TIMES]
    prices = {}
    # Stop at len(menu) - 1 so an odd number of cells cannot raise IndexError.
    for i in range(0, len(menu) - 1, 2):
        prices[menu[i]] = menu[i + 1].replace('.', ',')  # French decimal comma
    return prices


def get_address(soup):
    # Site-specific heuristic: the address lives in a <b> tag; keep the first
    # and last whitespace-separated tokens, then join the first and last lines.
    try:
        address = [b.text.strip().split(' ')[0] + b.text.strip().split(' ')[-1]
                   for b in soup.find_all('b') if b.text not in TIMES][0]
        return address.split('\n')[0] + ' ' + address.split('\n')[-1]
    except IndexError:
        # Return None (not the string 'NA') so dropna() in main() actually
        # drops rows whose address could not be parsed.
        return None


def scrap(soup):
    """Scrape every restaurant listed on one result page into a DataFrame."""
    names = get_names(soup)
    links = get_links(soup)
    df = pd.DataFrame(columns=['Name', 'Siret', 'Date', 'Address', 'Item', 'Price'])
    for name, link in zip(names, links):
        ic()
        ic(name)
        page = requests.get(BASE_URL + link)
        ic(page)
        detail_soup = BeautifulSoup(page.text, features='html.parser')
        # Fetch the detail page once and reuse the parsed soup for both
        # prices and address (the original fetched the same page three times).
        prices = get_prices(detail_soup)
        address = get_address(detail_soup)
        if address is not None:
            # Crude normalisation of common French address abbreviations.
            address = (address.replace('bis', '')
                       .replace('bd', 'boulevard')
                       .replace(' av ', ' avenue '))
        ic(address)
        # Scalars (name, date, address) are broadcast against the item list.
        temporary_df = pd.DataFrame({
            'Name': name,
            'Siret': pd.NA,  # not available on the site; filled elsewhere
            'Date': datetime.date.today(),
            'Address': address,
            'Item': list(prices.keys()),
            'Price': list(prices.values()),
        })
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df


def main():
    url = BASE_URL + '/fr/region/idf/restaurant#search'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    ic(soup)
    pages = get_pages(soup)
    # The first result page is already loaded; pages[0] is assumed to be it,
    # so scrape it directly, then the remaining pages.
    df = scrap(soup)
    for i in range(1, len(pages)):
        ic(i)
        new_page = requests.get(BASE_URL + pages[i])
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        df = pd.concat([df, scrap(new_soup)], ignore_index=True)
    # Drop restaurants whose address could not be parsed (see get_address).
    df = df.dropna(subset='Address')
    return df
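

# Usage sketch (not part of the original script): run the scraper and persist
# the result. The CSV filename is an assumption; adjust as needed.
if __name__ == '__main__':
    result = main()
    ic(result.head())
    result.to_csv('gaultmillau_idf_prices.csv', index=False)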