# prix_restos/resto_de_paris_scraping.py

from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic

URLS = ['https://restodeparis.com/coup-de-coeur/',
        'https://restodeparis.com/selection-michelin/',
        'https://restodeparis.com/restaurant/gastronomique/',
        'https://restodeparis.com/restaurant/bistronomique/',
        'https://restodeparis.com/restaurant/francais/',
        'https://restodeparis.com/restaurant/cuisine-du-monde/',
        'https://restodeparis.com/coup-de-coeur/']
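# NB: 'coup-de-coeur' appears twice in URLS; the duplicate is harmless
# because get_all_names() deduplicates the collected names with set().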


def get_names(soup):
    # Keep only the <a> tags that mark restaurant links on a listing page.
    names = soup.find_all('a')
    names_v2 = [name for name in names if str(name).find('target="_self" title=') != -1]
    # The links of interest are every other tag among the last 11 matches.
    names_v3 = [str(names_v2[_]) for _ in range(-11, 0, 2)]
    # Strip the closing '</a>' and keep the text after the final '>'.
    return [name[:-4].split('>')[-1] for name in names_v3]


def get_all_names(urls):
    names = []
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        names += get_names(soup)
    # Normalise each name into a URL slug: spaces become hyphens, apostrophes
    # are stripped, accents are folded to ASCII and everything is lower-cased.
    # NB: two characters in this chain were blanked by the file viewer's
    # "ambiguous Unicode" filter; '’' and 'ä' are best-guess reconstructions.
    names = [name.replace(' ', '-')
             .replace('’', '')
             .replace('\'', '')
             .replace('à', 'a')
             .replace('â', 'a')
             .replace('ä', 'a')
             .replace('é', 'e')
             .replace('è', 'e')
             .replace('î', 'i')
             .replace('ï', 'i')
             .replace('ö', 'o')
             .replace('œ', 'oe')
             .replace('ü', 'u')
             .replace('ç', 'c')
             .replace('---', '-')
             .lower()
             for name in names]
    return sorted(set(names))
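
# Example of the normalisation above, with a hypothetical name:
#   "Le Pré à Paris" -> "le-pre-a-paris"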


def get_restaurants_url(names):
    restaurants_url = []
    for name in names:
        # Special case: the 'cafe-sud' listing points to the
        # La Régalade Saint-Honoré page.
        if name == 'cafe-sud':
            url = 'https://restodeparis.com/restaurant/' + 'la-regalade-saint-honore/'
        else:
            url = 'https://restodeparis.com/restaurant/' + name + '/'
        restaurants_url.append(url)
    return restaurants_url
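
# Example with a hypothetical slug:
#   get_restaurants_url(['le-bistrot'])
#   -> ['https://restodeparis.com/restaurant/le-bistrot/']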


def get_address(soup):
    # The address is the text of the <span> that follows the
    # 'Y Aller !' ("Get there!") button.
    spans = [span.text for span in soup.find_all('span')]
    address = ''
    for i in range(len(spans)):
        if spans[i].find('Y Aller !') != -1:
            address = spans[i + 1][1:-4]
    return address.replace(',', '')


def get_menu(soup):
    prices = [str(span.text) for span in soup.find_all('span')]
    # Drop the last 23 spans, which come after the menu itself.
    prices = prices[:-23]
    # Drop empty entries. (The original removed items while iterating over
    # the same list, which skips elements; a comprehension avoids that bug
    # and makes the try/except around price[0] unnecessary.)
    prices = [price for price in prices if len(price) != 0 and price != '\n\n']
    prices_v2 = []
    for price in prices:
        if price[0] == '\n':
            prices_v2.append(price[1:price.find('\t')])
        else:
            prices_v2.append(price)
    # Keep only the entries after the 'La Carte' header...
    index_la_carte = 0
    for i in range(len(prices_v2)):
        if prices_v2[i] == 'La Carte':
            index_la_carte = i + 1
    prices_v2 = prices_v2[index_la_carte:]
    # ...and before the trailing 'Le Restaurant' section.
    for i in range(1, len(prices_v2) - 1):
        if prices_v2[-i] == 'Le Restaurant':
            prices_v2 = prices_v2[:-i]
    # Truncate after the last entry that contains a price.
    # NB: the euro sign was blanked by the file viewer; '€' is reconstructed.
    index_last_price = 0
    for i in range(len(prices_v2)):
        if '€' in prices_v2[i]:
            index_last_price = i + 1
    if index_last_price != 0:
        prices_v2 = prices_v2[:index_last_price]
    # Use commas as decimal separators, French-style.
    prices_v2 = [price.replace('.', ',') for price in prices_v2]
    # Entries alternate item, price; pair them into a dict.
    prices_v3 = {}
    for i in range(0, len(prices_v2), 2):
        prices_v3[prices_v2[i]] = prices_v2[i + 1]
    return prices_v3
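
# get_menu() returns an item -> price mapping, e.g. (hypothetical values):
#   {'Entrecôte': '32,00 €', 'Crème brûlée': '9,50 €'}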


def complete_scraping():
    names = get_all_names(URLS)
    restaurants_url = get_restaurants_url(names)
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
    for name, url in zip(names, restaurants_url):
        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
        ic()
        ic(name)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        menu = get_menu(soup)  # scrape the menu once, not once per column
        temporary_df['Item'] = list(menu.keys())
        temporary_df['Price'] = list(menu.values())
        address = get_address(soup)
        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Date'] = datetime.date.today()
            temporary_df.loc[i, 'Address'] = address
        df = pd.concat([df, temporary_df], ignore_index=True)
        # Archive the raw HTML of each page under a dated folder.
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Resto_de_Paris/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        html_path = newpath + '/' + name.replace('/', '-') + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(soup.prettify())
    df = df.dropna(subset='Address')
    return df
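

# Minimal usage sketch (not part of the original file): run the full scrape
# and save the result as a dated CSV. The output filename is an assumption.
if __name__ == '__main__':
    result = complete_scraping()
    result.to_csv('resto_de_paris_' + str(datetime.date.today()) + '.csv', index=False)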