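"""Scraper for the Paris restaurant listings on petitfute.com.

Collects each restaurant's name, address and price range into a pandas
DataFrame and archives the raw HTML of every scraped page locally.
"""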
import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from icecream import ic


def get_names(soup):
    """Return the restaurant names listed on a results page."""
    # ic(soup)
    # Skip the section headings containing 'manger à' and drop the
    # trailing <h2>, which is not a restaurant name.
    return [h2.text.strip() for h2 in soup.find_all('h2') if 'manger à' not in h2.text][:-1]


def get_number_of_pages(soup):
    """Read the total page count from the pagination block.

    Fragile by construction: it relies on the pagination being the 14th
    <ul> on the page and on the count being the two characters just
    before the last one in that list's text.
    """
    return int([ul.text for ul in soup.find_all('ul')][13][-3:-1])


def get_prices(soup):
    """Return one price range (e.g. '€€') per listing on a results page."""
    # The ' • €' marker matches several nested <div>s per listing, so
    # each price is captured more than once.
    prices = [div.text.strip() for div in soup.find_all('div') if ' • €' in div.text]
    prices_v2 = []
    for price in prices:
        x = price.find(' • €')
        prices_v2.append(price[x + 3:x + 8].strip())
    # Keep every sixth entry, skipping the leading page-level duplicates.
    return prices_v2[6::6]


def get_url(soup):
    """Return the absolute URL of every listing on a results page."""
    # Listing links are the relative '.html' anchors; drop the first and
    # last matches, which are not listings.
    links = [str(a) for a in soup.find_all('a') if '.html' in str(a) and 'https:' not in str(a)][1:-1]

    # Pull the href value out of each anchor's string representation.
    links_v2 = []
    for link in links:
        x = link.find('.html') + 5
        links_v2.append(link[:x].split('href')[-1][3:])

    # Build absolute URLs, dropping duplicates while preserving order.
    links_v3 = []
    for link in links_v2:
        if 'https://www.petitfute.com/' + link not in links_v3:
            links_v3.append('https://www.petitfute.com/' + link)

    return links_v3


def get_addresses(soup):
    """Visit every listing page and return its normalised street address."""
    urls = get_url(soup)
    addresses = []
    for url in urls:
        # ic()
        page = requests.get(url)
        soup2 = BeautifulSoup(page.text, features='html.parser')

        # The street address sits in the two <span>s just before the one
        # whose text is 'Paris,'.
        all_span = [span.text for span in soup2.find_all('span')]
        try:
            address = ((all_span[all_span.index('Paris,') - 2] + all_span[all_span.index('Paris,') - 1])
                       .replace(',', ' ').replace('  ', ' '))
        except ValueError:
            address = ''

        # Deal with words before the street number
        if not address.split(' ')[0].isdigit():
            address_piece_by_piece = address.split(' ')
            for i in range(len(address_piece_by_piece)):
                try:
                    if address_piece_by_piece[i][0].isdigit():
                        address = ' '.join(address_piece_by_piece[i:])
                        break
                except IndexError:
                    pass

        # Deal with double street numbers (keep only the first number)
        if '-' in address[:5]:
            address = address.split('-')[0] + ' ' + ' '.join(address.split(' ')[1:])

        # Deal with abbreviations
        address = address.replace('Av.', 'avenue').replace('bis ', '')

        # ic(address)
        addresses.append(address)

    return addresses
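

# A quick illustration of the address normalisation above, on
# hypothetical inputs (not data taken from the site):
#     'Le Bistrot 12 rue de la Paix' -> '12 rue de la Paix'   (leading words dropped)
#     '12-14 rue Oberkampf'          -> '12 rue Oberkampf'    (double number collapsed)
#     '5 Av. Montaigne'              -> '5 avenue Montaigne'  (abbreviation expanded)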


def scrap(soup):
    """Scrape one results page into a DataFrame and archive its HTML."""
    ic()
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    df['Name'] = get_names(soup)
    # ic(len(get_names(soup)))
    df['Date'] = datetime.date.today()
    try:
        df['Address'] = get_addresses(soup)
        # ic(len(get_addresses(soup)))
    except ValueError:
        # Column length mismatch: log both sides for inspection.
        ic(get_names(soup))
        ic(get_addresses(soup))
    try:
        df['Price'] = get_prices(soup)
        # ic(len(get_prices(soup)))
    except ValueError:
        ic(get_names(soup))
        ic(get_prices(soup))

    # Archive the full page HTML under today's date, once per restaurant.
    for name in df['Name']:
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)

        try:
            html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' +
                         str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')

            with open(html_path, 'wt', encoding='utf-8') as html_file:
                html_file.write(soup.prettify())
        except AttributeError:
            ic()

    # print(df)
    return df


def complete_scraping():
    """Scrape every results page and return the cleaned DataFrame."""
    url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')

    df = scrap(soup)

    # Scrape every paginated results page and stack the rows.
    number_of_pages = get_number_of_pages(soup)
    for i in range(number_of_pages + 1):
        # try:
        new_url = url + '?page=' + str(i)
        new_page = requests.get(new_url)
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
        # print(df.to_string())
        # except ValueError:
        #     pass

    # Reset the index so the positional clean-up below stays in step
    # with the rows that survive the dropna.
    df = df.dropna(subset='Address').reset_index(drop=True)

    # Keep only the '€…' rating itself, dropping any trailing text.
    for i in range(len(df)):
        price = df.loc[i, 'Price']
        if not pd.isna(price) and price.replace('€', '') != '':
            df.loc[i, 'Price'] = price.split(' ')[0]

    return df
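

# Example entry point: run the full scrape, print the table and write it
# to a CSV. A minimal sketch; the CSV filename is an assumption.
if __name__ == '__main__':
    result = complete_scraping()
    print(result.to_string())
    result.to_csv('petit_fute_' + str(datetime.date.today()) + '.csv', index=False)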