diff --git a/best_restaurants_scraping.py b/best_restaurants_scraping.py index 25678f2..3c39f54 100644 --- a/best_restaurants_scraping.py +++ b/best_restaurants_scraping.py @@ -85,7 +85,7 @@ def scrap(soup): os.makedirs(newpath) html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' + - str(datetime.date.today()) + '/' + name + '.html') + str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html') with open(html_path, 'wt', encoding='utf-8') as html_file: for line in soup.prettify(): @@ -115,6 +115,7 @@ def get_soup(url): def complete_scraping(): + ic() url = 'https://bestrestaurantsparis.com/fr/explore/' soup = get_soup(url) @@ -127,6 +128,19 @@ def complete_scraping(): new_soup = get_soup(new_url) temporary_df = scrap(new_soup) df = pd.concat([df, temporary_df], ignore_index=True) - + df = df.dropna(subset='Address') + + prices = df['Price'] + for i in range(len(prices)): + if 'href' in prices[i]: + ic() + df = df.drop(index=[i, i + 1]) + + items = df['Price'] + for i in range(len(items)): + if not pd.isna(items[i]): + if items[i].replace('€', '') != '': + items[i] = items[i].split(' ')[0] + return df