from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic


# Number of pages
def get_number_of_pages(soup):
    pages = [_ for _ in soup.find_all()]
    number_of_pages = []

    for _ in pages:
        if str(_).find('paris-8246?page=') != -1:
            x = str(_).find('paris-8246?page=')
            # Two characters right after 'page='; only two-digit page numbers survive the check below.
            y = str(_)[x + 16: x + 18]
            if y[-1].isnumeric():
                number_of_pages.append(int(y))
    # The number of pages is the largest page number linked from the results page.
    last_page = max(number_of_pages)
    ic(last_page)
    return last_page


# Restaurant names
def get_names(soup):
    # The restaurant names sit in the last twelve <h2> tags of the results page;
    # fixed-width slicing strips the surrounding markup.
    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:] if '<span-class=' not in str(_)]
    # Normalize each name into a URL slug: hyphens for spaces, accents stripped, lower case.
    new_names = [
        name.replace(' ', '-')
        .replace('’', '')
        .replace('\'', '')
        .replace('à', 'a')
        .replace('â', 'a')
        .replace('ắ', 'a')
        .replace('é', 'e')
        .replace('è', 'e')
        .replace('î', 'i')
        .replace('ï', 'i')
        .replace('ö', 'o')
        .replace('œ', 'oe')
        .replace('ü', 'u')
        .replace('ç', 'c')
        .lower()
        for name in names
    ]
    return new_names
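

# NOTE (illustrative sketch, not used by the scraper): the chained .replace() calls
# above only cover the accents seen so far. Assuming the same slug format is wanted,
# a more general accent-stripper can be built with unicodedata; the helper name
# `strip_accents` is hypothetical, and 'œ' does not decompose so it still needs its own rule:
#
#     import unicodedata
#
#     def strip_accents(text):
#         decomposed = unicodedata.normalize('NFKD', text)
#         return ''.join(c for c in decomposed if not unicodedata.combining(c))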


# Le Fooding price dictionary
prices_dictionnary = {0: 'Moins de 15 €',
                      1: '16 à 35 €',
                      2: '36 à 50 €',
                      3: 'Plus de 51 €'}


# Single restaurant price
def get_price(page):
    soup = BeautifulSoup(page.text, features='html.parser')

    price = set(_ for _ in soup.find_all())
    restaurant_price = []

    for _ in price:
        # Position of each price label in the tag; -1 means the label is absent.
        p = [str(_).find('Moins de 15 €'),
             str(_).find('De 16 à 35'),
             str(_).find('De 36 à 50 €'),
             str(_).find('Plus de 51')]

        # sum(p) == -4 means none of the four labels appears in this tag.
        if sum(p) != -4:
            if -1 in p:
                for __ in range(len(p)):
                    if p[__] != -1:
                        restaurant_price.append(__)
    # The most frequently matched price bracket wins.
    return prices_dictionnary[max(set(restaurant_price), key=restaurant_price.count)]


# Single restaurant address
def get_adress(soup):
    address = ''
    for _ in soup:
        if str(_).find('</a></address>') != -1:
            x = str(_).find('</a></address>')
            # Keep the 100 characters before the closing tag, then strip the markup
            # and the trailing country part.
            address = str(_)[x - 100:x]
            try:
                address = address.split('>')[1:][0]
            except IndexError:
                pass
            address = address.lstrip()[:-13]
            address = ''.join(address.split('France')[:-1])[:-2]

    return address.replace(',', '')


# Restaurant prices and addresses for a full results page
def get_prices_and_addresses(names):
    prices = []
    adresses = []
    adress = ''
    soup = ''
    for name in names:
        ic(name)
        if not name.isascii():
            ic()
            x = 'Not ASCII'
            prices.append(x)
        else:
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            new_page = requests.get(new_url)
            x = 0
            match str(new_page):
                case '<Response [200]>':
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    adress = get_adress(soup)
                case '<Response [404]>':
                    ic()
                    # Retry with the alternative 'restaurant-<name>-paris' URL pattern.
                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
                    new_page = requests.get(new_url)
                    match str(new_page):
                        case '<Response [200]>':
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            adress = get_adress(soup)
                        case '<Response [404]>':
                            ic()
                            x = '<Response [404]>'
                            # Last resort: try the numbered variants of the URL.
                            for i in range(1, 21):
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if str(new_page) == '<Response [200]>':
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    adress = get_adress(soup)
                                    break

            prices.append(x)
            adresses.append(adress)

            # Save the fetched restaurant page locally, in a folder named after today's date.
            if soup != '':
                newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
                if not os.path.exists(newpath):
                    os.makedirs(newpath)

                html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
                             str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')

                with open(html_path, 'wt', encoding='utf-8') as html_file:
                    html_file.write(soup.prettify())

    return prices, adresses
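

# NOTE (illustrative, not part of the original logic): matching on str(new_page), i.e.
# the repr '<Response [200]>', works but ties the code to requests' repr format. An
# equivalent check uses the documented status_code attribute, e.g.:
#
#     if new_page.status_code == 200:
#         ...
#     elif new_page.status_code == 404:
#         ...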


# Scrape one results page
def scrap_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')

    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)

    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
    for i in range(len(df)):
        df.loc[i, 'Date'] = datetime.date.today()
    return df


# Main function
def complete_scraping():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)

    temporary_df = scrap_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)

    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrap_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)

    # Drop rows whose "name" is leftover markup rather than a restaurant name,
    # then rows with no address.
    df = df[~df['Name'].str.contains('style="display')]
    df = df.dropna(subset=['Address'])

    return df
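

# NOTE (illustrative usage sketch, not part of the original module): running the full
# scrape and saving the result; the CSV filename below is hypothetical.
if __name__ == '__main__':
    result = complete_scraping()
    result.to_csv('lefooding_paris_' + str(datetime.date.today()) + '.csv', index=False)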