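"""Scraper for restodeparis.com: collects restaurant names from the category
pages, then visits each restaurant page to record its address and 'La Carte'
menu prices into a pandas DataFrame, archiving the raw HTML along the way."""
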
import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from icecream import ic


# Category pages listing the restaurants to scrape.
URLS = ['https://restodeparis.com/coup-de-coeur/',
        'https://restodeparis.com/selection-michelin/',
        'https://restodeparis.com/restaurant/gastronomique/',
        'https://restodeparis.com/restaurant/bistronomique/',
        'https://restodeparis.com/restaurant/francais/',
        'https://restodeparis.com/restaurant/cuisine-du-monde/']


def get_names(soup):
    # Restaurant links are the anchors rendered with target="_self" followed
    # by a title attribute; filter on that literal pattern in the markup.
    links = soup.find_all('a')
    named_links = [link for link in links
                   if str(link).find('target="_self" title=') != -1]

    # Among the last eleven matches, every second one is a restaurant card;
    # strip the surrounding tags to keep only the visible name.
    last_links = [str(named_links[i]) for i in range(-11, 0, 2)]
    return [link[:-4].split('>')[-1] for link in last_links]


def get_all_names(urls):
    names = []
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        names += get_names(soup)

    # Turn each display name into the slug used in the site's URLs, e.g.
    # "L'Assiette" -> "lassiette": hyphenate spaces, drop apostrophes,
    # transliterate accented letters, then lowercase.
    names = [name.replace(' ', '-').replace('’', '').replace('\'', '')
             .replace('à', 'a').replace('â', 'a').replace('ắ', 'a')
             .replace('é', 'e').replace('è', 'e')
             .replace('î', 'i').replace('ï', 'i')
             .replace('ö', 'o').replace('œ', 'oe')
             .replace('ü', 'u').replace('ç', 'c')
             .replace('---', '-')
             .lower()
             for name in names]

    return sorted(set(names))


def get_restaurants_url(names):
    restaurants_url = []
    for name in names:
        if name == 'cafe-sud':
            # Special-case mapping kept for this slug, which does not resolve
            # to its own page.
            url = 'https://restodeparis.com/restaurant/la-regalade-saint-honore/'
        else:
            url = 'https://restodeparis.com/restaurant/' + name + '/'
        restaurants_url.append(url)
    return restaurants_url


def get_address(soup):
    # The address appears in the span that follows the 'Y Aller !' button;
    # the slice trims the stray characters around it.
    spans = [span.text for span in soup.find_all('span')]
    address = ''
    for i, text in enumerate(spans):
        if text.find('Y Aller !') != -1:
            address = spans[i + 1][1:-4]
    return address.replace(',', '')


def get_menu(soup):
    prices = [str(span.text) for span in soup.find_all('span')]
    # The last 23 spans on a restaurant page are footer boilerplate.
    prices = prices[:-23]

    # Drop empty entries; building a new list avoids the bug of removing
    # items from a list while iterating over it.
    prices = [price for price in prices if price and price != '\n\n']

    # Entries padded with a newline carry their text before the first tab.
    prices_v2 = []
    for price in prices:
        if price[0] == '\n':
            prices_v2.append(price[1:price.find('\t')])
        else:
            prices_v2.append(price)

    # Keep only what follows the last 'La Carte' heading.
    index_la_carte = 0
    for i in range(len(prices_v2)):
        if prices_v2[i] == 'La Carte':
            index_la_carte = i + 1
    prices_v2 = prices_v2[index_la_carte:]

    # Cut the trailing 'Le Restaurant' block, scanning from the end; stop at
    # the first hit so later indices don't run past the shortened list.
    for i in range(1, len(prices_v2) - 1):
        if prices_v2[-i] == 'Le Restaurant':
            prices_v2 = prices_v2[:-i]
            break

    # Truncate after the last entry that carries a euro amount.
    index_last_price = 0
    for i in range(len(prices_v2)):
        if '€' in prices_v2[i]:
            index_last_price = i + 1
    if index_last_price != 0:
        prices_v2 = prices_v2[:index_last_price]

    # French decimal notation for prices.
    prices_v2 = [price.replace('.', ',') for price in prices_v2]

    # Pair each item with the price that follows it; zip silently drops a
    # dangling item with no matching price instead of raising IndexError.
    return dict(zip(prices_v2[0::2], prices_v2[1::2]))


def complete_scraping():
    names = get_all_names(URLS)
    restaurants_url = get_restaurants_url(names)
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])

    for name, url in zip(names, restaurants_url):
        ic()
        ic(name)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')

        # Parse the menu once and reuse it for both columns.
        menu = get_menu(soup)
        temporary_df = pd.DataFrame({'Item': list(menu.keys()),
                                     'Price': list(menu.values())})
        temporary_df['Name'] = name
        temporary_df['Date'] = datetime.date.today()
        temporary_df['Address'] = get_address(soup)

        df = pd.concat([df, temporary_df], ignore_index=True)

        # Archive the raw HTML in a folder named after today's date.
        newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/'
                   'Resto_de_Paris/' + str(datetime.date.today()))
        if not os.path.exists(newpath):
            os.makedirs(newpath)

        html_path = newpath + '/' + name.replace('/', '-') + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(soup.prettify())

    # Restaurants whose page yielded no address are dropped.
    df = df.dropna(subset='Address')
    return df
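
# Example entry point (illustrative sketch): run the full scrape and persist
# the result. The CSV file name here is an assumption, not part of the
# original pipeline.
if __name__ == '__main__':
    df = complete_scraping()
    ic(df.head())
    df.to_csv('resto_de_paris_' + str(datetime.date.today()) + '.csv',
              index=False)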