Actualiser best_restaurants_scraping.py
This commit is contained in:
parent
2b01ab1af8
commit
b0a9202d8c
|
@@ -85,7 +85,7 @@ def scrap(soup):
|
||||||
os.makedirs(newpath)
|
os.makedirs(newpath)
|
||||||
|
|
||||||
html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' +
|
html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' +
|
||||||
str(datetime.date.today()) + '/' + name + '.html')
|
str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
|
||||||
|
|
||||||
with open(html_path, 'wt', encoding='utf-8') as html_file:
|
with open(html_path, 'wt', encoding='utf-8') as html_file:
|
||||||
for line in soup.prettify():
|
for line in soup.prettify():
|
||||||
|
@@ -115,6 +115,7 @@ def get_soup(url):
|
||||||
|
|
||||||
|
|
||||||
def complete_scraping():
|
def complete_scraping():
|
||||||
|
ic()
|
||||||
url = 'https://bestrestaurantsparis.com/fr/explore/'
|
url = 'https://bestrestaurantsparis.com/fr/explore/'
|
||||||
soup = get_soup(url)
|
soup = get_soup(url)
|
||||||
|
|
||||||
|
@@ -127,6 +128,19 @@ def complete_scraping():
|
||||||
new_soup = get_soup(new_url)
|
new_soup = get_soup(new_url)
|
||||||
temporary_df = scrap(new_soup)
|
temporary_df = scrap(new_soup)
|
||||||
df = pd.concat([df, temporary_df], ignore_index=True)
|
df = pd.concat([df, temporary_df], ignore_index=True)
|
||||||
|
|
||||||
df = df.dropna(subset='Address')
|
df = df.dropna(subset='Address')
|
||||||
|
|
||||||
|
prices = df['Price']
|
||||||
|
for i in range(len(prices)):
|
||||||
|
if 'href' in prices[i]:
|
||||||
|
ic()
|
||||||
|
df = df.drop(index=[i, i + 1])
|
||||||
|
|
||||||
|
items = df['Price']
|
||||||
|
for i in range(len(items)):
|
||||||
|
if not pd.isna(items[i]):
|
||||||
|
if items[i].replace('€', '') != '':
|
||||||
|
items[i] = items[i].split(' ')[0]
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
Loading…
Reference in New Issue
Block a user