"""Scrape restaurant menus from the Gault & Millau site (Île-de-France region).

Walks the paginated listing at fr.gaultmillau.com, visits each restaurant's
page, collects (name, date, address, menu item, price) rows, and writes the
result to a CSV file when run as a script.
"""

from bs4 import BeautifulSoup
import requests
import datetime
import pandas as pd
from icecream import ic

# All restaurant/listing URLs on the site are relative to this host.
BASE_URL = 'https://fr.gaultmillau.com'

# Day/period labels that share <td>/<b> tags with the data we want; they are
# filtered out wherever menu or address text is extracted.
TIMES = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi',
         'Dimanche', 'Matin', 'Midi', 'Soir']

# Output schema. 'Siret' is never filled by this scraper and stays NaN.
COLUMNS = ['Name', 'Siret', 'Date', 'Address', 'Item', 'Price']


def get_pages(soup):
    """Return the relative hrefs of the listing's pagination <option> tags.

    NOTE(review): the original de-duplicates through ``set()``, which makes the
    order of the returned pages nondeterministic — ``main`` then skips index 0
    assuming it is the page already scraped. Preserved as-is; confirm whether
    the hrefs should be sorted instead.
    """
    options = soup.find_all('option')
    pages = {str(opt).split('>')[-3][:-1].split('"')[-1]
             for opt in options if 'Page' in str(opt)}
    return list(pages)


def get_names(soup):
    """Return the restaurant names on a listing page (last two <h4> are chrome)."""
    return [tag.text.strip() for tag in soup.find_all('h4')][:-2]


def get_links(soup):
    """Return the relative hrefs of every restaurant link on a listing page."""
    return [str(tag).split('href')[-1][2:].split('>')[0][:-1]
            for tag in soup.find_all('a') if 'restaurants/' in str(tag)]


def get_prices(link, soup=None):
    """Return a {menu item: price} dict for one restaurant.

    Fetches ``BASE_URL + link`` unless an already-parsed ``soup`` of that page
    is supplied (avoids a redundant HTTP request when the caller has it).
    Prices use a French decimal comma.
    """
    if soup is None:
        page = requests.get(BASE_URL + link)
        soup = BeautifulSoup(page.text, features='html.parser')
    menu = [td.text.strip() for td in soup.find_all('td')
            if td.text.strip() not in TIMES]
    # Cells alternate item / price. Bound by len(menu) - 1 so a dangling
    # unpaired cell is dropped instead of raising IndexError.
    prices = {}
    for i in range(0, len(menu) - 1, 2):
        prices[menu[i]] = menu[i + 1].replace('.', ',')
    return prices


def get_address(soup):
    """Return "street postcode-city" parsed from the page's first data <b> tag.

    Returns the literal string 'NA' when no suitable <b> tag is present.
    """
    candidates = [b.text.strip().split(' ')[0] + b.text.strip().split(' ')[-1]
                  for b in soup.find_all('b') if b.text not in TIMES]
    try:
        address = candidates[0]
    except IndexError:
        # Page layout without an address block — keep the row, mark it 'NA'.
        return 'NA'
    return address.split('\n')[0] + ' ' + address.split('\n')[-1]


def scrap(soup):
    """Scrape every restaurant linked from one listing page.

    Returns a DataFrame with COLUMNS; one row per (restaurant, menu item).
    """
    names = get_names(soup)
    links = get_links(soup)

    frames = []
    for name, link in zip(names, links):
        ic()
        ic(name)
        page = requests.get(BASE_URL + link)
        ic(page)
        restaurant_soup = BeautifulSoup(page.text, features='html.parser')

        # Reuse the soup we just fetched: one request per restaurant instead
        # of the three the duplicated get_prices() calls used to make.
        prices = get_prices(link, restaurant_soup)
        address = (get_address(restaurant_soup).replace('bis', '')
                   .replace('bd', 'boulevard').replace(' av ', ' avenue '))
        ic(address)

        # Build the frame directly from the data; assigning dict views as
        # columns of a zero-row DataFrame raises in modern pandas.
        temporary_df = pd.DataFrame(
            {'Item': list(prices.keys()), 'Price': list(prices.values())},
            columns=COLUMNS,
        )
        temporary_df['Name'] = name
        temporary_df['Date'] = datetime.date.today()
        temporary_df['Address'] = address
        frames.append(temporary_df)

    if not frames:
        # Listing page with no restaurants: keep the schema, zero rows.
        return pd.DataFrame(columns=COLUMNS)
    return pd.concat(frames, ignore_index=True)


def main():
    """Scrape every listing page of the region and return the combined DataFrame."""
    url = 'https://fr.gaultmillau.com/fr/region/idf/restaurant#search'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    ic(soup)

    pages = get_pages(soup)
    df = scrap(soup)
    # Index 0 is assumed to be the page already scraped above (see the
    # ordering caveat on get_pages); fetch and scrape the rest.
    for i in range(1, len(pages)):
        ic(i)
        new_page = requests.get(BASE_URL + pages[i])
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        df = pd.concat([df, scrap(new_soup)], ignore_index=True)
    return df


if __name__ == '__main__':
    # Guarded so importing this module no longer triggers a full scrape.
    ic()
    df = main()
    ic()
    df.to_csv('/Users/oliviermeyer/Desktop/gault_et_milau_test.csv',
              index=False, header=True, escapechar='\\')