Upload files to "/"
commit 5c576928da
le_fooding_scraping.py (new file, 204 lines)

@@ -0,0 +1,204 @@
from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic


# Number of pages
def get_number_of_pages(soup):
    pages = [_ for _ in soup.find_all()]
    number_of_pages = []

    for _ in pages:
        if str(_).find('paris-8246?page=') != -1:
            x = str(_).find('paris-8246?page=')
            # Up to two characters follow 'page='; keep only the digits so
            # single-digit page numbers are parsed as well.
            y = str(_)[x + 16: x + 18]
            digits = ''.join(c for c in y if c.isdigit())
            if digits:
                number_of_pages.append(int(digits))
    # The largest page number linked from the pagination is the page count
    # (a set has no defined order, so take max() explicitly).
    ic(max(number_of_pages))
    return max(number_of_pages)


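# A regular expression over the raw HTML is an equivalent way to read the
# page count and copes with page numbers of any width; a sketch only, not
# called by the scraper below.
import re

def get_number_of_pages_re(html):
    # Collect every 'paris-8246?page=N' occurrence and keep the largest N.
    pages = re.findall(r'paris-8246\?page=(\d+)', html)
    return max(int(p) for p in pages) if pages else 1

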
# Restaurant names
def get_names(soup):
    # The last twelve <h2> elements on a results page are the restaurant
    # cards; str(_)[41:-5] strips the surrounding tag markup, and headings
    # that embed a <span class=...> are skipped.
    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:] if '<span class=' not in str(_)]
    # Build URL slugs: hyphenate spaces, drop apostrophes, fold accents.
    new_names = [
        name.replace(' ', '-')
        .replace('’', '')
        .replace('\'', '')
        .replace('à', 'a')
        .replace('â', 'a')
        .replace('ắ', 'a')
        .replace('é', 'e')
        .replace('è', 'e')
        .replace('î', 'i')
        .replace('ï', 'i')
        .replace('ö', 'o')
        .replace('œ', 'oe')
        .replace('ü', 'u')
        .replace('ç', 'c')
        .lower()
        for name in names
    ]
    return new_names


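# Alternative sketch: the replace() chain above can be generalised with
# unicodedata, which folds any combining accent. 'œ' is special-cased
# because NFKD does not decompose the oe ligature. Not wired into
# get_names().
import unicodedata

def fold_accents(name):
    # Decompose accented letters, drop the combining marks, then apply the
    # same space/apostrophe handling as get_names().
    decomposed = unicodedata.normalize('NFKD', name.replace('œ', 'oe'))
    ascii_name = ''.join(c for c in decomposed if not unicodedata.combining(c))
    return ascii_name.replace(' ', '-').replace('’', '').replace('\'', '').lower()

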
# Le Fooding price dictionary
prices_dictionary = {0: 'Moins de 15 €',
                     1: '16 à 35 €',
                     2: '36 à 50 €',
                     3: 'Plus de 51 €'}


# Single restaurant price
def get_price(page):
    soup = BeautifulSoup(page.text, features='html.parser')

    price = set(_ for _ in soup.find_all())
    restaurant_price = []

    for _ in price:
        # Position of each price label inside the element, -1 when absent.
        p = [str(_).find('Moins de 15 €'),
             str(_).find('De 16 à 35'),
             str(_).find('De 36 à 50 €'),
             str(_).find('Plus de 51')]

        if sum(p) != -4:  # at least one label found
            if -1 in p:  # ignore elements that contain every label at once
                for __ in range(len(p)):
                    if p[__] != -1:
                        restaurant_price.append(__)
    if not restaurant_price:
        # No price label anywhere on the page.
        return None
    # The most frequently matched label wins.
    return prices_dictionary[max(set(restaurant_price), key=restaurant_price.count)]


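# Equivalent lookup on the visible page text instead of scanning every tag;
# a sketch that assumes the price label appears verbatim in the body. Note
# the first match wins here, unlike the frequency vote above.
def get_price_from_text(page):
    text = BeautifulSoup(page.text, features='html.parser').get_text()
    labels = ['Moins de 15 €', 'De 16 à 35', 'De 36 à 50 €', 'Plus de 51']
    for index, label in enumerate(labels):
        if label in text:
            return prices_dictionary[index]
    return None

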
def get_address(soup):
    address = ''
    for _ in soup:
        if str(_).find('</a></address>') != -1:
            x = str(_).find('</a></address>')
            # Keep the 100 characters before the closing tags, then trim the
            # markup and the trailing ', France' boilerplate away.
            address = str(_)[x - 100:x]
            try:
                address = address.split('>')[1:][0]
            except IndexError:
                pass
            address = address.lstrip()[:-13]
            address = ''.join(address.split('France')[:-1])[:-2]

    return address.replace(',', '')


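# Selector-based sketch of the same extraction, assuming the page marks the
# address up as <address>...<a>...</a></address> (which is what the string
# search in get_address() implies). Not used by the scraper.
def get_address_via_tag(soup):
    tag = soup.find('address')
    if tag is None:
        return ''
    link = tag.find('a')
    # Prefer the link text, fall back to the whole <address> element.
    text = (link or tag).get_text(strip=True)
    return ''.join(text.split('France')[:-1]).rstrip(', ').replace(',', '')

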
# Restaurant prices and addresses of a complete page
def get_prices_and_addresses(names):
    prices = []
    addresses = []
    for name in names:
        address = ''
        soup = ''
        if not name.isascii():
            # The slug still contains a character the replace() chain in
            # get_names() does not cover; skip it rather than request a bad
            # URL, and keep prices/addresses aligned with names.
            ic()
            prices.append('Not ASCII')
            addresses.append('')
        else:
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            ic(new_url)
            new_page = requests.get(new_url)
            x = 0
            match new_page.status_code:
                case 200:
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    address = get_address(soup)
                    ic(address)
                case 404:
                    # Retry with the alternative 'restaurant-<name>-paris' slug.
                    ic()
                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
                    new_page = requests.get(new_url)
                    ic(new_url)
                    match new_page.status_code:
                        case 200:
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            address = get_address(soup)
                            ic(address)
                        case 404:
                            # Last resort: try numbered variants of the slug.
                            ic()
                            x = '<Response [404]>'
                            for i in range(1, 21):
                                ic()
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if new_page.status_code == 200:
                                    ic()
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    address = get_address(soup)
                                    ic(address)
                                    break

            prices.append(x)
            addresses.append(address)

            if soup != '':
                # Archive the fetched page under today's date.
                newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
                if not os.path.exists(newpath):
                    os.makedirs(newpath)

                html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
                             str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')

                with open(html_path, 'wt', encoding='utf-8') as html_file:
                    html_file.write(soup.prettify())

    return prices, addresses


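# The scraper issues many sequential requests.get() calls; routing them
# through one Session with a timeout is a common hardening step. A sketch
# only, not wired into the functions above; the 10 s timeout is arbitrary.
_session = requests.Session()

def fetch(url, timeout=10):
    # Connection reuse plus a hard cap on how long a hung server can block.
    return _session.get(url, timeout=timeout)

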
# Scrape one results page into a DataFrame
def scrap_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')

    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)
    ic(prices, addresses)

    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
    df['Date'] = datetime.date.today()
    print(df.to_string())
    return df


# Main function
def main():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)

    temporary_df = scrap_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)
    print(df.to_string())

    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrap_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)
        print(df.to_string())

    return df


if __name__ == '__main__':
    df = main()
    df.to_csv('/Users/oliviermeyer/Desktop/le_fooding_test.csv', index=False, header=True, escapechar='\\')