88 lines
3.0 KiB
Python
88 lines
3.0 KiB
Python
from bs4 import BeautifulSoup
|
|
import requests
|
|
import datetime
|
|
import pandas as pd
|
|
from icecream import ic
|
|
|
|
|
|
def get_pages(soup):
    """Return the unique pagination URLs found in the listing page.

    Only ``<option>`` tags whose markup mentions ``'Page'`` are considered;
    the URL is read from the option's ``value`` attribute (the original
    recovered it by slicing ``str(tag)``, which is fragile).

    First-seen order is preserved: the original ``list(set(...))`` made the
    order nondeterministic, which broke the caller's assumption that
    index 0 is the page already scraped.
    """
    urls = [opt.get('value', '') for opt in soup.find_all('option')
            if 'Page' in str(opt)]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(urls))
|
|
|
|
|
|
def get_names(soup):
    """Extract restaurant names from the page's ``<h4>`` headings.

    The last two ``<h4>`` elements are site chrome rather than restaurant
    names, so they are dropped.
    """
    headings = []
    for heading in soup.find_all('h4'):
        headings.append(heading.text.strip())
    return headings[:-2]
|
|
|
|
|
|
def get_links(soup):
    """Return the ``href`` of every anchor whose markup mentions 'restaurants/'.

    The original implementation recovered the href by slicing the tag's
    string representation, which silently breaks when the anchor carries
    attributes after ``href`` (or none at all); reading the attribute
    directly via ``Tag.get`` is robust.  The selection criterion
    (``'restaurants/' in str(tag)``) is kept identical.
    """
    return [tag.get('href', '') for tag in soup.find_all('a')
            if 'restaurants/' in str(tag)]
|
|
|
|
|
|
def get_prices(link):
    """Fetch a restaurant detail page and return its menu as ``{item: price}``.

    The page lays the menu out in ``<td>`` cells alternating item / price;
    cells holding day/time labels are skipped.  Prices are normalised to
    use a comma as the decimal separator (French convention).

    Parameters
    ----------
    link : str
        Path of the restaurant page, appended to the site's base URL.

    Returns
    -------
    dict
        Menu item name mapped to its price string.
    """
    base_url = 'https://fr.gaultmillau.com'
    times = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi',
             'Dimanche', 'Matin', 'Midi', 'Soir']
    new_page = requests.get(base_url + link)
    new_soup = BeautifulSoup(new_page.text, features='html.parser')
    menu = [cell.text.strip() for cell in new_soup.find_all('td')
            if cell.text.strip() not in times]
    prices = {}
    # Cells alternate item / price; stop one short of the end so an odd
    # trailing cell (an item with no price) cannot raise IndexError.
    for i in range(0, len(menu) - 1, 2):
        prices[menu[i]] = menu[i + 1].replace('.', ',')
    return prices
|
|
|
|
|
|
def get_address(soup):
    """Build a compact address string from the page's ``<b>`` tags.

    The first bold element whose text is not a day/time label is used:
    its first and last space-separated tokens are concatenated, then any
    embedded newline in that result is collapsed to a single space.
    Returns ``'NA'`` when no such element exists.
    """
    times = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi',
             'Dimanche', 'Matin', 'Midi', 'Soir']
    candidates = []
    for bold in soup.find_all('b'):
        if bold.text in times:
            continue
        tokens = bold.text.strip().split(' ')
        candidates.append(tokens[0] + tokens[-1])
    if not candidates:
        return 'NA'
    lines = candidates[0].split('\n')
    return lines[0] + ' ' + lines[-1]
|
|
|
|
|
|
def scrap(soup):
    """Scrape one listing page into a DataFrame of menu items.

    For each restaurant on the listing page, fetch its detail page,
    collect its menu items/prices and address, and append one row per
    menu item.  The 'Siret' column is created but never filled here —
    presumably populated elsewhere; confirm against callers.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page.

    Returns
    -------
    pandas.DataFrame
        Columns: Name, Siret, Date, Address, Item, Price.
    """
    columns = ['Name', 'Siret', 'Date', 'Address', 'Item', 'Price']
    names = get_names(soup)
    links = get_links(soup)

    df = pd.DataFrame(columns=columns)
    for name, link in zip(names, links):
        temporary_df = pd.DataFrame(columns=columns)
        ic()
        ic(name)
        page = requests.get('https://fr.gaultmillau.com' + link)
        ic(page)
        # Renamed from `soup` — the original shadowed the parameter.
        detail_soup = BeautifulSoup(page.text, features='html.parser')
        # Fetch the menu once; the original called get_prices() twice,
        # issuing a duplicate HTTP request per restaurant.
        prices = get_prices(link)
        temporary_df['Item'] = prices.keys()
        temporary_df['Price'] = prices.values()
        address = (get_address(detail_soup).replace('bis', '')
                   .replace('bd', 'boulevard').replace(' av ', ' avenue '))
        ic(address)
        # Scalar assignment broadcasts over all rows — replaces the
        # original per-row `.loc` loop.
        temporary_df['Name'] = name
        temporary_df['Date'] = datetime.date.today()
        temporary_df['Address'] = address
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df
|
|
|
|
|
|
def main():
    """Scrape every listing page of the Île-de-France restaurant index.

    Scrapes the landing page, then each additional pagination URL, and
    concatenates the per-page results.  Rows without a usable address —
    a true missing value, or the 'NA' sentinel string that get_address
    returns — are dropped at the end.

    Returns
    -------
    pandas.DataFrame
        One row per (restaurant, menu item) with its price and address.
    """
    url = 'https://fr.gaultmillau.com/fr/region/idf/restaurant#search'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    ic(soup)

    pages = get_pages(soup)
    df = scrap(soup)
    # pages[0] is assumed to be the page scraped above — TODO confirm
    # against the site's pagination markup.
    for i, page_path in enumerate(pages[1:], start=1):
        ic(i)
        new_page = requests.get('https://fr.gaultmillau.com' + page_path)
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        df = pd.concat([df, scrap(new_soup)], ignore_index=True)

    # dropna() alone never removed the 'NA' sentinel strings produced by
    # get_address (they are not NaN), so filter them explicitly as well.
    df = df.dropna(subset=['Address'])
    df = df[df['Address'] != 'NA']
    return df
|