Upload files to "/"
This commit is contained in:
parent 5c576928da
commit 7b7bf32f29
best_restaurants_scraping_v2.py | 136 lines | Normal file
@@ -0,0 +1,136 @@
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import re
import datetime
import os
import pandas as pd
from icecream import ic


def get_names(soup):
    # Restaurant names are the <h4> headings on the listing page.
    return [_.text for _ in soup.find_all('h4')]


def get_addresses(soup):
    # The first nine <li> entries are navigation chrome, not restaurant data.
    addresses = [_.text for _ in soup.find_all('li')][9:]

    # Address blocks are separated by blank <li> texts; keep the two
    # entries that follow each blank one (the trailing match is spurious).
    addresses_v2 = [addresses[_ + 1:_ + 3] for _ in range(len(addresses))
                    if addresses[_] == ' ' or addresses[_] == ''][:-1]

    # Heuristic: a street address starts with a digit (street number) and
    # does not end with one (the postcode is followed by the city name).
    addresses_v3 = []
    for addresse in addresses_v2:
        if addresse[0].strip()[0].isdigit() and not addresse[0].strip()[-1].isdigit():
            addresses_v3.append(addresse[0].strip())
        elif addresse[-1].strip()[0].isdigit() and not addresse[-1].strip()[-1].isdigit():
            addresses_v3.append(addresse[-1].strip())
        else:
            addresses_v3.append('NA')

    return addresses_v3


def get_urls(soup):
    # Find every link to a restaurant detail page by scanning the raw HTML
    # for the href prefix, then trim each 120-character window back down to
    # the base restaurant URL.
    urls_indexes = [_.start() for _ in re.finditer('href="https://bestrestaurantsparis.com/fr/r', str(soup))]
    return ['/'.join(str(soup)[_:_ + 120][6:].split('/')[:-1]) + '/' for _ in urls_indexes]


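# Alternative sketch (not wired into the scraper): the same link harvest via
# BeautifulSoup attribute matching instead of a fixed 120-character window.
# Assumes every restaurant link carries an absolute href with the prefix
# matched above; results are kept in document order, including duplicates,
# so they stay aligned with get_names() and get_addresses() when zipped.
def get_urls_bs4(soup):
    pattern = re.compile(r'^https://bestrestaurantsparis\.com/fr/r')
    return ['/'.join(a['href'].split('/')[:-1]) + '/'
            for a in soup.find_all('a', href=pattern)]

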
def get_prices(url):
    # Reuse get_soup() (defined below) instead of duplicating the
    # headless-Chrome setup it performs.
    soup = get_soup(url)

    # Every menu price on the page ends with a euro sign; the last two
    # occurrences of '€' are not menu prices, so drop them.
    prices_indexes = [_.start() for _ in re.finditer('€', str(soup))][:-2]

    # Walk back 150 characters from each euro sign to pick the item label
    # and its figure out of the surrounding markup.
    prices = [' '.join(str(soup)[_ - 150:_ - 1].split('label')[-1][2:].split('</div><div class="menu-price">'))
              for _ in prices_indexes]

    # The first entry may be the average price ('Prix moyen'), which sits
    # in different markup than the menu items.
    try:
        if prices[0][:10] == 'Prix moyen':
            prices[0] = ' '.join(prices[0].split('</div><div class="value">')).split('<')[0]
    except IndexError:
        pass

    # Map each item label to its price string (label -> 'NN €').
    prices_v2 = {}
    for _ in prices:
        prices_v2[' '.join(_.split(' ')[:-1])] = _.split(' ')[-1] + ' €'
    return prices_v2, soup


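# Alternative sketch (not wired in): the markup fragments split on above
# suggest the figures live in <div class="menu-price"> elements, so the same
# data could in principle be read with class selectors instead of a fixed
# 150-character window. The companion 'menu-label' class name is a guess,
# not confirmed by the page source.
def get_prices_bs4(soup):
    labels = [div.text.strip() for div in soup.find_all('div', class_='menu-label')]
    figures = [div.text.strip() for div in soup.find_all('div', class_='menu-price')]
    return dict(zip(labels, figures))

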
def get_number_of_pages(soup):
    # The pagination widget renders page numbers as purely numeric <li>
    # entries; the last one is the total page count.
    return int([_.text for _ in soup.find_all('li') if _.text.isdigit()][-1])


def scrap(soup):
    names = get_names(soup)
    addresses = get_addresses(soup)
    urls = get_urls(soup)

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
    for name, address, url in zip(names, addresses, urls):
        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
        ic()
        ic(name)
        # ic(url)
        # Scrape each restaurant page once: get_prices() also returns the
        # page's soup so it can be archived below.
        prices, restaurant_soup = get_prices(url)
        temporary_df['Item'] = prices.keys()
        temporary_df['Price'] = prices.values()

        # Archive the raw HTML of the restaurant page under today's date.
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)

        html_path = newpath + '/' + name + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(restaurant_soup.prettify())

        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Address'] = address
            temporary_df.loc[i, 'Date'] = datetime.date.today()

        df = pd.concat([df, temporary_df], ignore_index=True)
    return df


def get_soup(url):
    # Render the page in headless Chrome and parse the resulting HTML.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)

    driver.get(url)
    time.sleep(1)

    page_source = driver.page_source
    driver.quit()
    soup = BeautifulSoup(page_source, features='html.parser')

    return soup


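# Alternative sketch (not wired in): replace the fixed one-second sleep with
# an explicit wait, so slow pages are not captured half-rendered. Waiting on
# an <h4> mirrors what get_names() relies on, but it is an assumption about
# which element signals a fully loaded page.
def get_soup_with_wait(url, timeout=10):
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By

    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, 'h4')))
        return BeautifulSoup(driver.page_source, features='html.parser')
    finally:
        driver.quit()

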
def main():
    url = 'https://bestrestaurantsparis.com/fr/explore/'
    soup = get_soup(url)

    number_of_pages = get_number_of_pages(soup)
    df = scrap(soup)
    # Pages 2..N are reached through the '?pg=<n>&sort=latest' query string.
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = url + '?pg=' + str(i) + '&sort=latest'
        ic(new_url)
        new_soup = get_soup(new_url)
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df


if __name__ == '__main__':
    ic()
    df = main()
    df.to_csv('/Users/oliviermeyer/Desktop/best_restaurants_siret_test.csv',
              index=False, header=True, escapechar='\\')
    ic()