Actualiser best_restaurants_scraping.py

This commit is contained in:
Olivier MEYER 2024-07-15 14:42:28 +02:00
parent 2b01ab1af8
commit b0a9202d8c

View File

@@ -85,7 +85,7 @@ def scrap(soup):
os.makedirs(newpath) os.makedirs(newpath)
html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' + html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' +
str(datetime.date.today()) + '/' + name + '.html') str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
with open(html_path, 'wt', encoding='utf-8') as html_file: with open(html_path, 'wt', encoding='utf-8') as html_file:
for line in soup.prettify(): for line in soup.prettify():
@@ -115,6 +115,7 @@ def get_soup(url):
def complete_scraping(): def complete_scraping():
ic()
url = 'https://bestrestaurantsparis.com/fr/explore/' url = 'https://bestrestaurantsparis.com/fr/explore/'
soup = get_soup(url) soup = get_soup(url)
@@ -129,4 +130,17 @@ def complete_scraping():
df = pd.concat([df, temporary_df], ignore_index=True) df = pd.concat([df, temporary_df], ignore_index=True)
df = df.dropna(subset='Address') df = df.dropna(subset='Address')
prices = df['Price']
for i in range(len(prices)):
if 'href' in prices[i]:
ic()
df = df.drop(index=[i, i + 1])
items = df['Price']
for i in range(len(items)):
if not pd.isna(items[i]):
if items[i].replace('', '') != '':
items[i] = items[i].split(' ')[0]
return df return df