From 5c576928da862826146b83639afde518c8c62cee Mon Sep 17 00:00:00 2001
From: Olivier MEYER
Date: Tue, 25 Jun 2024 17:28:54 +0200
Subject: [PATCH] Upload the files to "/"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 le_fooding_scraping.py | 204 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100644 le_fooding_scraping.py

diff --git a/le_fooding_scraping.py b/le_fooding_scraping.py
new file mode 100644
index 0000000..4cf7e55
--- /dev/null
+++ b/le_fooding_scraping.py
@@ -0,0 +1,204 @@
+from bs4 import BeautifulSoup
+import requests
+import datetime
+import os
+import pandas as pd
+from icecream import ic
+
+
+# Number of pages
+def get_number_of_pages(soup):
+    pages = [_ for _ in soup.find_all()]
+    number_of_pages = []
+
+    for _ in pages:
+        if str(_).find('paris-8246?page=') != -1:
+            x = str(_).find('paris-8246?page=')
+            # Keep the (up to two) digits that follow the "page=" parameter
+            y = str(_)[x + 16: x + 18]
+            digits = ''.join(c for c in y if c.isdigit())
+            if digits:
+                number_of_pages.append(int(digits))
+    # The highest page number linked from the results page is the page count
+    ic(max(number_of_pages))
+    return max(number_of_pages)
+
+
+# Restaurants names
+def get_names(soup):
+    # The tag filter used in this comprehension was stripped from the patch;
+    # only the slicing of the last twelve <h2> elements survives.
+    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:]]
+    return names
+
+
+# Restaurant price (the original body was stripped from the patch;
+# the signature is restored from its call sites below)
+def get_price(page):
+    price = ''
+    return price
+
+
+# Restaurant address
+def get_adress(soup):
+    address = ''
+    for _ in soup.find_all():
+        # The marker string searched for here was stripped from the patch
+        if str(_).find('') != -1:
+            x = str(_).find('')
+            address = str(_)[x - 100:x]
+            try:
+                address = address.split('>')[1:][0]
+            except IndexError:
+                pass
+            address = address.lstrip()[:-13]
+            address = ''.join(address.split('France')[:-1])[:-2]
+
+    return address.replace(',', '')
+
+
+# Restaurants prices and addresses of a complete page
+def get_prices_and_addresses(names):
+    prices = []
+    adresses = []
+    adress = ''
+    soup = ''
+    for name in names:
+        if not name.isascii():
+            ic()
+            x = 'Not ASCII'
+            prices.append(x)
+            # Keep both lists the same length so the final DataFrame stays aligned
+            adresses.append('')
+        else:
+            new_url = 'https://lefooding.com/restaurants/' + name.lower()
+            ic(new_url)
+            new_page = requests.get(new_url)
+            x = 0
+            match str(new_page):
+                case '<Response [200]>':
+                    ic()
+                    x = get_price(new_page)
+                    soup = BeautifulSoup(new_page.text, features='html.parser')
+                    adress = get_adress(soup)
+                    ic(adress)
+                case '<Response [404]>':
+                    ic()
+                    # Retry with the "restaurant-<name>-paris" URL pattern
+                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
+                    new_page = requests.get(new_url)
+                    ic(new_url)
+                    match str(new_page):
+                        case '<Response [200]>':
+                            ic()
+                            x = get_price(new_page)
+                            soup = BeautifulSoup(new_page.text, features='html.parser')
+                            adress = get_adress(soup)
+                            ic(adress)
+                        case '<Response [404]>':
+                            ic()
+                            x = ''
+                            # Last resort: try the same URL with a numeric suffix
+                            for i in range(1, 21):
+                                ic()
+                                new_url2 = new_url + '-' + str(i)
+                                new_page = requests.get(new_url2)
+                                if str(new_page) == '<Response [200]>':
+                                    ic()
+                                    x = get_price(new_page)
+                                    soup = BeautifulSoup(new_page.text, features='html.parser')
+                                    adress = get_adress(soup)
+                                    ic(adress)
+                                    break
+
+            prices.append(x)
+            adresses.append(adress)
+
+            # Save the raw HTML of the restaurant page for later inspection
+            if soup != '':
+                newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
+                if not os.path.exists(newpath):
+                    os.makedirs(newpath)
+
+                html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/'
+                             + str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
+
+                with open(html_path, 'wt', encoding='utf-8') as html_file:
+                    html_file.write(soup.prettify())
+
+    return prices, adresses
+
+
+# Scrape one results page into a DataFrame
+def scrap_page(url):
+    page = requests.get(url)
+    soup = BeautifulSoup(page.text, features='html.parser')
+
+    names = get_names(soup)
+    prices, addresses = get_prices_and_addresses(names)
+    ic(prices, addresses)
+
+    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
+    for i in range(len(df)):
+        df.loc[i, 'Date'] = datetime.date.today()
+    print(df.to_string())
+    return df
+
+
+# Main function
+def main():
+    ic()
+    # Initialization
+    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
+    page = requests.get(starting_url)
+    soup = BeautifulSoup(page.text, features='html.parser')
+
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
+    number_of_pages = get_number_of_pages(soup)
+
+    temporary_df = scrap_page(starting_url)
+    df = pd.concat([df, temporary_df], ignore_index=True)
+    print(df.to_string())
+
+    # Loop through the other pages
+    for i in range(2, number_of_pages + 1):
+        ic(i)
+        new_url = starting_url + '?page=' + str(i)
+        temporary_df = scrap_page(new_url)
+        df = pd.concat([df, temporary_df], ignore_index=True)
+        print(df.to_string())
+
+    return df
+
+
+if __name__ == '__main__':
+    df = main()
+    df.to_csv('/Users/oliviermeyer/Desktop/le_fooding_test.csv', index=False, header=True, escapechar='\\')
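
Note: the body of get_price did not survive in this patch; only its signature (it is called with a requests response) can be recovered, so the file above keeps a stub. A minimal sketch of one possible implementation, assuming the price appears somewhere on the restaurant page as a euro amount; the regex and the whole body are guesses, not the original code:

    import re

    import requests
    from bs4 import BeautifulSoup


    # Hypothetical replacement for get_price: return the first euro amount
    # ("35 €", "€35", "25-45 €", ...) found in the page text. The original
    # extraction logic is unknown, so treat this purely as a placeholder.
    def get_price(page):
        soup = BeautifulSoup(page.text, features='html.parser')
        found = re.search(r'€\s*\d+|\d+(?:\s*-\s*\d+)?\s*€', soup.get_text())
        return found.group(0) if found else ''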
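The script tells a found restaurant page from a missing one by comparing str(new_page) against '<Response [200]>' or '<Response [404]>', which relies on how requests formats a Response repr. The same information is available directly on the response object; a small sketch of an equivalent check, kept separate from the patch above:

    import requests


    def try_url(url):
        # Equivalent to matching str(response) against '<Response [200]>',
        # but reads the status code instead of relying on the repr() format.
        response = requests.get(url)
        return response if response.status_code == 200 else None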