From 2fcd47baa6065baa6179f24925f978457beb8526 Mon Sep 17 00:00:00 2001
From: Olivier MEYER
Date: Tue, 25 Jun 2024 17:30:41 +0200
Subject: [PATCH] Upload files to "/"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 resto_de_paris_scraping.py | 159 +++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 resto_de_paris_scraping.py

diff --git a/resto_de_paris_scraping.py b/resto_de_paris_scraping.py
new file mode 100644
index 0000000..e96d655
--- /dev/null
+++ b/resto_de_paris_scraping.py
@@ -0,0 +1,159 @@
+from bs4 import BeautifulSoup
+import requests
+import datetime
+import os
+import pandas as pd
+from icecream import ic
+
+
+# Listing pages whose restaurant links are harvested.
+URLS = ['https://restodeparis.com/coup-de-coeur/',
+        'https://restodeparis.com/selection-michelin/',
+        'https://restodeparis.com/restaurant/gastronomique/',
+        'https://restodeparis.com/restaurant/bistronomique/',
+        'https://restodeparis.com/restaurant/francais/',
+        'https://restodeparis.com/restaurant/cuisine-du-monde/']
+
+
+def get_names(soup):
+    """Extract the restaurant names from one listing page."""
+    # Restaurant links are the anchors carrying target="_self" and a title.
+    anchors = soup.find_all('a', target='_self', title=True)
+    # Among the last eleven such anchors, every second one holds the
+    # visible restaurant name; the ones in between wrap images.
+    return [a.get_text() for a in anchors[-11::2]]
+
+
+def get_all_names(urls):
+    """Collect every restaurant name and turn it into a URL slug."""
+    names = []
+    for url in urls:
+        page = requests.get(url)
+        soup = BeautifulSoup(page.text, features='html.parser')
+        names += get_names(soup)
+
+    # Build slugs the way the site does: spaces become hyphens, accents
+    # are stripped, everything is lowercased.  (A unicodedata-based
+    # alternative is sketched just after this function.)
+    names = [name.replace(' ', '-').replace('’', '').replace('\'', '')
+             .replace('à', 'a')
+             .replace('â', 'a')
+             .replace('ắ', 'a')
+             .replace('é', 'e')
+             .replace('è', 'e')
+             .replace('î', 'i')
+             .replace('ï', 'i')
+             .replace('ö', 'o')
+             .replace('œ', 'oe')
+             .replace('ü', 'u')
+             .replace('ç', 'c')
+             .replace('---', '-')
+             .lower()
+             for name in names]
+
+    # Deduplicate (the listing pages overlap) and sort for stable output.
+    return sorted(set(names))
+
+
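+# A possible alternative to the .replace() chain above, sketched with the
+# standard library (illustrative only; unicodedata is not used elsewhere
+# in this script):
+#
+#     import unicodedata
+#
+#     def strip_accents(text):
+#         decomposed = unicodedata.normalize('NFKD', text)
+#         return ''.join(c for c in decomposed if not unicodedata.combining(c))
+#
+# NFKD leaves 'œ' intact (it has no decomposition to 'oe'), which is one
+# case the explicit chain handles that this sketch would not.
+
+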
+def get_restaurants_url(names):
+    """Build the page URL for each restaurant slug."""
+    restaurants_url = []
+    for name in names:
+        if name == 'cafe-sud':
+            # Special case: the 'cafe-sud' listing is served from the
+            # 'la-regalade-saint-honore' page.
+            url = 'https://restodeparis.com/restaurant/la-regalade-saint-honore/'
+        else:
+            url = 'https://restodeparis.com/restaurant/' + name + '/'
+        restaurants_url.append(url)
+    return restaurants_url
+
+
+def get_address(soup):
+    """Pull the street address out of a restaurant page."""
+    spans = [span.text for span in soup.find_all('span')]
+    address = ''
+    for i in range(len(spans) - 1):
+        # The address sits in the <span> right after the "Y Aller !"
+        # button; [1:-4] trims the surrounding formatting characters.
+        if 'Y Aller !' in spans[i]:
+            address = spans[i + 1][1:-4]
+    return address.replace(',', '')
+
+
+def get_menu(soup):
+    """Return the 'La Carte' section of a restaurant page as {item: price}."""
+    prices = [str(span.text) for span in soup.find_all('span')]
+    # The trailing 23 <span> tags are page footer, not menu content.
+    prices = prices[:-23]
+
+    # Drop empty entries before any indexing below.
+    prices = [price for price in prices if price and price != '\n\n']
+
+    # Strip a leading newline and anything from the first tab onwards.
+    prices_v2 = []
+    for price in prices:
+        if price[0] == '\n':
+            prices_v2.append(price[1:price.find('\t')])
+        else:
+            prices_v2.append(price)
+
+    # Keep only what follows the last 'La Carte' heading.
+    index_la_carte = 0
+    for i, entry in enumerate(prices_v2):
+        if entry == 'La Carte':
+            index_la_carte = i + 1
+    prices_v2 = prices_v2[index_la_carte:]
+
+    # Cut off the trailing 'Le Restaurant' section, if present.
+    if 'Le Restaurant' in prices_v2:
+        prices_v2 = prices_v2[:prices_v2.index('Le Restaurant')]
+
+    # Trim everything after the last entry that actually shows a price.
+    index_last_price = 0
+    for i, entry in enumerate(prices_v2):
+        ic(entry)
+        if '€' in entry:
+            index_last_price = i + 1
+    if index_last_price != 0:
+        prices_v2 = prices_v2[:index_last_price]
+
+    # Normalise decimal points to the French comma.
+    prices_v2 = [price.replace('.', ',') for price in prices_v2]
+
+    # The list now alternates label, price, label, price ... (a worked
+    # example sits at the end of this file); pair entries into a dict.
+    prices_v3 = {}
+    for i in range(0, len(prices_v2) - 1, 2):
+        prices_v3[prices_v2[i]] = prices_v2[i + 1]
+
+    return prices_v3
+
+
+def main():
+    names = get_all_names(URLS)
+    restaurants_url = get_restaurants_url(names)
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
+
+    for name, url in zip(names, restaurants_url):
+        ic(name)
+        page = requests.get(url)
+        soup = BeautifulSoup(page.text, features='html.parser')
+        menu = get_menu(soup)  # parse once, reuse for both columns
+        address = get_address(soup)
+        ic(address)
+
+        # Scalars (name, date, address) broadcast to the menu's length.
+        temporary_df = pd.DataFrame({'Name': name,
+                                     'Date': datetime.date.today(),
+                                     'Address': address,
+                                     'Item': list(menu.keys()),
+                                     'Price': list(menu.values())})
+
+        df = pd.concat([df, temporary_df], ignore_index=True)
+
+        # Keep a dated copy of the raw page next to the scraped data.
+        newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Resto_de_Paris/'
+                   + str(datetime.date.today()))
+        if not os.path.exists(newpath):
+            os.makedirs(newpath)
+
+        html_path = os.path.join(newpath, name.replace('/', '-') + '.html')
+        with open(html_path, 'wt', encoding='utf-8') as html_file:
+            html_file.write(soup.prettify())
+
+    return df
+
+
+if __name__ == '__main__':
+    ic()  # marks the start of the run in the debug output
+    df = main()
+    df.to_csv('/Users/oliviermeyer/Desktop/resto_de_paris_test.csv',
+              index=False, header=True, escapechar='\\')
+    ic()  # marks the end of the run
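+# Worked example of the label/price pairing done at the end of get_menu,
+# on made-up values (the real list comes from the page's <span> tags):
+#
+#     >>> flat = ['Entrée + Plat', '32,00 €', 'Menu Dégustation', '85,00 €']
+#     >>> {flat[i]: flat[i + 1] for i in range(0, len(flat) - 1, 2)}
+#     {'Entrée + Plat': '32,00 €', 'Menu Dégustation': '85,00 €'}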