prix_restos/gault_et_millau_scraping.py

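"""Scrape menu prices from the Gault & Millau restaurant listing for Île-de-France.

For each restaurant the script collects its name, address, and menu
item/price pairs, and concatenates everything into one pandas DataFrame.
"""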

import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from icecream import ic


def get_pages(soup):
    # The pagination dropdown renders one <option> per results page; the href
    # is pulled out of the raw tag string, which is brittle but matches the
    # site's markup at scraping time. sorted() makes the page order
    # deterministic (a bare set iterates in arbitrary order).
    pages = sorted(set(str(_).split('>')[-3][:-1].split('"')[-1]
                       for _ in soup.find_all('option') if 'Page' in str(_)))
    return pages
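

# Restaurant names are the listing page's <h4> headings; the last two <h4>s
# are assumed to be page furniture rather than restaurants, hence [:-2].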
def get_names(soup):
    return [_.text.strip() for _ in soup.find_all('h4')][:-2]
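

# Extract the href of every anchor that points at a restaurant detail page.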
def get_links(soup):
    return [str(_).split('href')[-1][2:].split('>')[0][:-1]
            for _ in soup.find_all('a') if 'restaurants/' in str(_)]
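

# Fetch a restaurant's detail page and return its menu as an {item: price}
# dict. Day/time labels share the same <td> cells as the menu, so they are
# filtered out before pairing the remaining cells.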
def get_prices(link):
    base_url = 'https://fr.gaultmillau.com'
    times = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi',
             'Dimanche', 'Matin', 'Midi', 'Soir']
    new_page = requests.get(base_url + link)
    new_soup = BeautifulSoup(new_page.text, features='html.parser')
    menu = [_.text.strip() for _ in new_soup.find_all('td')
            if _.text.strip() not in times]
    prices = {}
    # Cells alternate item / price; stop at len(menu) - 1 so a stray trailing
    # cell cannot raise an IndexError. Prices keep a French decimal comma.
    for i in range(0, len(menu) - 1, 2):
        prices[menu[i]] = menu[i + 1].replace('.', ',')
    return prices
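

# Pull the street address out of the page's <b> tags, keeping the first and
# last fragments of the first match; returns 'NA' when nothing matches.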
def get_address(soup):
    times = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi',
             'Dimanche', 'Matin', 'Midi', 'Soir']
    try:
        address = [_.text.strip().split(' ')[0] + _.text.strip().split(' ')[-1]
                   for _ in soup.find_all('b') if _.text not in times][0]
        return address.split('\n')[0] + ' ' + address.split('\n')[-1]
    except IndexError:
        return 'NA'
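

# Scrape one listing page: one output row per (restaurant, menu item) pair.
# The 'Siret' column is created but never filled in this script.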
def scrap(soup):
    names = get_names(soup)
    links = get_links(soup)
    columns = ['Name', 'Siret', 'Date', 'Address', 'Item', 'Price']
    df = pd.DataFrame(columns=columns)
    for name, link in zip(names, links):
        temporary_df = pd.DataFrame(columns=columns)
        ic()
        ic(name)
        page = requests.get('https://fr.gaultmillau.com' + link)
        ic(page)
        soup = BeautifulSoup(page.text, features='html.parser')
        prices = get_prices(link)  # fetch once instead of once per column
        temporary_df['Item'] = list(prices.keys())
        temporary_df['Price'] = list(prices.values())
        # Expand a few common abbreviations in the parsed address.
        address = (get_address(soup).replace('bis', '')
                   .replace('bd', 'boulevard').replace(' av ', ' avenue '))
        ic(address)
        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Date'] = datetime.date.today()
            temporary_df.loc[i, 'Address'] = address
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df
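

# Crawl the whole Île-de-France listing: scrape the first results page, then
# each paginated page, and drop rows where no address could be parsed.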
def main():
    url = 'https://fr.gaultmillau.com/fr/region/idf/restaurant#search'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    ic(soup)
    pages = get_pages(soup)
    number_of_pages = len(pages)
    df = scrap(soup)
    # Skip pages[0], assumed to be the page already fetched above.
    for i in range(1, number_of_pages):
        ic(i)
        new_url = 'https://fr.gaultmillau.com' + pages[i]
        new_page = requests.get(new_url)
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
    df = df.dropna(subset=['Address'])
    return df
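

# Hypothetical entry point, added as a minimal sketch: run the full scrape
# and persist the result. The CSV filename is illustrative, not from the
# original script.
if __name__ == '__main__':
    restaurants = main()
    restaurants.to_csv('gault_et_millau_prices.csv', index=False)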