# prix_restos/le_fooding_scraping.py

from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic
# Number of result pages for Paris
def get_number_of_pages(soup):
    # Scan every tag for pagination links of the form 'paris-8246?page=NN'
    # ('paris-8246?page=' is 16 characters, so the page number starts at x + 16).
    pages = [_ for _ in soup.find_all()]
    number_of_pages = []
    for _ in pages:
        if str(_).find('paris-8246?page=') != -1:
            x = str(_).find('paris-8246?page=')
            y = str(_)[x + 16: x + 18]
            if y[-1].isnumeric():
                number_of_pages.append(int(y))
    # The highest page number seen is the last results page.
    last_page = max(number_of_pages)
    ic(last_page)
    return last_page
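
# A more defensive variant (a sketch, not part of the original pipeline): pull
# every page number out of the pagination hrefs with a regular expression, so
# the result no longer depends on the number being exactly two digits wide.
import re

def get_number_of_pages_re(soup):
    numbers = [int(n) for n in re.findall(r'paris-8246\?page=(\d+)', str(soup))]
    return max(numbers) if numbers else 1
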
# Restaurant names
def get_names(soup):
    # The last twelve <h2> tags on a results page hold the restaurant names;
    # the fixed-width slice [41:-5] strips the surrounding tag markup.
    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:] if '<span class=' not in str(_)]
    # Build URL slugs: spaces become dashes, apostrophes are dropped and
    # accented characters are folded to ASCII.
    new_names = [
        name.replace(' ', '-')
        .replace('’', '')
        .replace('\'', '')
        .replace('à', 'a')
        .replace('â', 'a')
        .replace('ä', 'a')
        .replace('é', 'e')
        .replace('è', 'e')
        .replace('î', 'i')
        .replace('ï', 'i')
        .replace('ö', 'o')
        .replace('œ', 'oe')
        .replace('ü', 'u')
        .replace('ç', 'c')
        .lower()
        for name in names
    ]
    return new_names
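
# One-step accent folding (a sketch, not the original approach): NFKD
# decomposition plus an ASCII re-encode covers every accent in the table
# above; 'œ' has no decomposition in Unicode, so the ligature is still
# mapped by hand.
import unicodedata

def slugify(name):
    name = name.replace(' ', '-').replace('’', '').replace('\'', '').replace('œ', 'oe')
    decomposed = unicodedata.normalize('NFKD', name)
    return decomposed.encode('ascii', 'ignore').decode('ascii').lower()
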
# Le Fooding price dictionary
prices_dictionary = {0: 'Moins de 15 €',
                     1: '16 à 35 €',
                     2: '36 à 50 €',
                     3: 'Plus de 51 €'}
# Single restaurant price
def get_price(page):
    # Search every tag for one of the four price labels; the most frequent
    # match decides the price bracket.
    soup = BeautifulSoup(page.text, features='html.parser')
    tags = set(_ for _ in soup.find_all())
    restaurant_price = []
    for _ in tags:
        p = [str(_).find('Moins de 15 €'),
             str(_).find('De 16 à 35'),
             str(_).find('De 36 à 50 €'),
             str(_).find('Plus de 51')]
        for index, position in enumerate(p):
            if position != -1:
                restaurant_price.append(index)
    return prices_dictionary[max(set(restaurant_price), key=restaurant_price.count)]
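
# Example usage (the slug below is hypothetical; substitute any slug produced
# by get_names):
#   page = requests.get('https://lefooding.com/restaurants/le-bistrot-exemple')
#   if page.status_code == 200:
#       print(get_price(page))  # e.g. '16 à 35 €'
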
# Single restaurant address
def get_address(soup):
    # The street address sits just before the closing '</a></address>' markup;
    # take the 100 characters ahead of it, then trim tags, whitespace,
    # the trailing ', France' and any commas.
    address = ''
    for _ in soup:
        if str(_).find('</a></address>') != -1:
            x = str(_).find('</a></address>')
            address = str(_)[x - 100:x]
    try:
        address = address.split('>')[1:][0]
    except IndexError:
        pass
    address = address.lstrip()[:-13]
    address = ''.join(address.split('France')[:-1])[:-2]
    return address.replace(',', '')
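
# A sturdier variant (a sketch; like the string search above, it assumes the
# page wraps the address in a semantic <address> element):
def get_address_from_tag(soup):
    tag = soup.find('address')
    if tag is None:
        return ''
    text = tag.get_text(' ', strip=True)
    return text.replace(',', '').replace('France', '').strip()
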
# Prices and addresses for every restaurant on a results page
def get_prices_and_addresses(names):
    prices = []
    addresses = []
    address = ''
    soup = ''
    for name in names:
        ic(name)
        if not name.isascii():
            # A slug that still contains non-ASCII characters cannot be turned
            # into a valid URL, so flag it instead of requesting it.
            ic()
            x = 'Not ASCII'
            prices.append(x)
        else:
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            new_page = requests.get(new_url)
            x = 0
            match new_page.status_code:
                case 200:
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    address = get_address(soup)
                case 404:
                    ic()
                    # Retry with the longer URL pattern some listings use.
                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
                    new_page = requests.get(new_url)
                    match new_page.status_code:
                        case 200:
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            address = get_address(soup)
                        case 404:
                            ic()
                            x = '<Response [404]>'
                            # Last resort: try the numbered variants '-1' to '-20'.
                            for i in range(1, 21):
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if new_page.status_code == 200:
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    address = get_address(soup)
                                    break
            prices.append(x)
        addresses.append(address)
        if soup != '':
            # Archive the raw HTML, one folder per day.
            newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
            if not os.path.exists(newpath):
                os.makedirs(newpath)
            html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
                         str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
            with open(html_path, 'wt', encoding='utf-8') as html_file:
                html_file.write(soup.prettify())
    return prices, addresses
# Scrape one results page into a DataFrame
def scrape_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)
    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
    df['Date'] = datetime.date.today()
    return df
# Main function
def complete_scraping():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)
    temporary_df = scrape_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)
    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrape_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)
    # Drop leftover markup rows and entries without a resolved address
    df = df[~df['Name'].str.contains('style="display')]
    df = df.dropna(subset='Address')
    return df
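
# Minimal entry point (a sketch; the original script may be invoked from
# elsewhere, and the CSV file name is an assumption):
if __name__ == '__main__':
    results = complete_scraping()
    results.to_csv('le_fooding_' + str(datetime.date.today()) + '.csv', index=False)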