# prix_restos/le_fooding_scraping.py

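"""Scrape restaurant names, addresses, and price brackets for Paris from
lefooding.com, saving each restaurant's HTML page locally along the way.

The entry point is complete_scraping(), which returns a pandas DataFrame
with columns Name, Date, Address, and Price.
"""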
import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from icecream import ic

# Number of pages
def get_number_of_pages(soup):
    """Return the highest page number referenced by the pagination links."""
    marker = 'paris-8246?page='
    number_of_pages = []
    for tag in soup.find_all():
        text = str(tag)
        x = text.find(marker)
        if x != -1:
            # At most two digits follow the marker in the link URL.
            digits = ''.join(c for c in text[x + len(marker):x + len(marker) + 2] if c.isdigit())
            if digits:
                number_of_pages.append(int(digits))
    ic(max(number_of_pages))
    return max(number_of_pages)

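# For example, if the deepest pagination link on the landing page is
# 'paris-8246?page=43', get_number_of_pages() returns 43.
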
# Restaurant names
def get_names(soup):
    # The last 12 <h2> tags on a results page hold the restaurant names.
    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:] if '<span class=' not in str(_)]
    # Turn each name into a URL slug: hyphens for spaces, apostrophes dropped,
    # accented characters folded to ASCII, everything lower-cased.
    new_names = [
        name.replace(' ', '-')
            .replace('’', '')
            .replace('\'', '')
            .replace('à', 'a')
            .replace('â', 'a')
            .replace('ä', 'a')
            .replace('é', 'e')
            .replace('è', 'e')
            .replace('î', 'i')
            .replace('ï', 'i')
            .replace('ö', 'o')
            .replace('œ', 'oe')
            .replace('ü', 'u')
            .replace('ç', 'c')
            .lower()
        for name in names
    ]
    return new_names

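# Example (hypothetical name, for illustration): an <h2> holding
# "Café Les Deux Gares" yields the slug "cafe-les-deux-gares", matching
# the lefooding.com/restaurants/<slug> URL scheme used below.
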
# Le Fooding price brackets, indexed in the order they are searched below
prices_dictionary = {
    0: 'Moins de 15 €',
    1: '16 à 35 €',
    2: '36 à 50 €',
    3: 'Plus de 51 €',
}

# Single restaurant price
def get_price(page):
    """Return the price bracket that appears most often on a restaurant page."""
    soup = BeautifulSoup(page.text, features='html.parser')
    restaurant_price = []
    for tag in soup.find_all():
        text = str(tag)
        # str.find() returns -1 when a label is absent.
        p = [text.find('Moins de 15 €'),
             text.find('De 16 à 35'),
             text.find('De 36 à 50 €'),
             text.find('Plus de 51')]
        # Skip tags with no label, and tags that contain all four labels
        # (those are page-wide containers, not the restaurant's own price).
        if sum(p) != -4 and -1 in p:
            for index, position in enumerate(p):
                if position != -1:
                    restaurant_price.append(index)
    # Majority vote across all tags that mention a bracket.
    return prices_dictionary[max(set(restaurant_price), key=restaurant_price.count)]

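# For instance, a page whose tags mention 'De 16 à 35' more often than any
# other bracket makes get_price() return '16 à 35 €'.
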
# Restaurant address
def get_address(soup):
    """Extract the street address from the page's <address> block."""
    address = ''
    for _ in soup:
        if str(_).find('</a></address>') != -1:
            x = str(_).find('</a></address>')
            # Heuristic: the address sits in the 100 characters before the tag.
            address = str(_)[x - 100:x]
    try:
        address = address.split('>')[1:][0]
    except IndexError:
        pass
    # Trim the markup remnants and the trailing ', France'.
    address = address.lstrip()[:-13]
    address = ''.join(address.split('France')[:-1])[:-2]
    return address.replace(',', '')

# Restaurant prices and addresses for a complete page
def get_prices_and_addresses(names):
    prices = []
    addresses = []
    for name in names:
        ic(name)
        address = ''
        soup = ''
        if not name.isascii():
            # Slugs with leftover non-ASCII characters cannot be looked up.
            ic()
            x = 'Not ASCII'
        else:
            # First try the plain slug URL.
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            new_page = requests.get(new_url)
            x = 0
            match new_page.status_code:
                case 200:
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    address = get_address(soup)
                case 404:
                    # Fall back to the 'restaurant-<slug>-paris' URL scheme.
                    ic()
                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
                    new_page = requests.get(new_url)
                    match new_page.status_code:
                        case 200:
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            address = get_address(soup)
                        case 404:
                            # Last resort: try numbered variants of the slug.
                            ic()
                            x = '<Response [404]>'
                            for i in range(1, 21):
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if new_page.status_code == 200:
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    address = get_address(soup)
                                    break
        prices.append(x)
        addresses.append(address)
        # Save the restaurant's HTML page for later inspection.
        if soup != '':
            newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
            if not os.path.exists(newpath):
                os.makedirs(newpath)
            html_path = newpath + '/' + name.replace('/', '-') + '.html'
            with open(html_path, 'wt', encoding='utf-8') as html_file:
                html_file.write(soup.prettify())
    return prices, addresses

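# Illustrative lookup order for a hypothetical slug 'septime':
#   1. https://lefooding.com/restaurants/septime
#   2. https://lefooding.com/restaurants/restaurant-septime-paris
#   3. https://lefooding.com/restaurants/restaurant-septime-paris-1 ... -20
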
# Scrape one results page into a DataFrame
def scrap_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)
    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
    df['Date'] = datetime.date.today()
    return df

# Main function
def complete_scraping():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)
    temporary_df = scrap_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)
    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrap_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)
    # Drop rows whose names are leftover markup, then rows with no address.
    df = df[~df['Name'].str.contains('style="display')]
    df = df.dropna(subset='Address')
    return df

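# Minimal usage sketch. The CSV filename is illustrative, not part of the
# original pipeline.
if __name__ == '__main__':
    df = complete_scraping()
    df.to_csv('le_fooding_' + str(datetime.date.today()) + '.csv', index=False)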