diff --git a/petit_fute_scraping.py b/petit_fute_scraping.py
index a9b9de7..46e5f80 100644
--- a/petit_fute_scraping.py
+++ b/petit_fute_scraping.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 import requests
 import datetime
+import os
 import pandas as pd
 from icecream import ic
 
@@ -39,12 +40,11 @@ def get_url(soup):
 
 def get_addresses(soup):
     urls = get_url(soup)
-    ic(len(urls))
     addresses = []
     for url in urls:
+        ic()
         ic(url)
         page = requests.get(url)
-        ic(page)
         soup2 = BeautifulSoup(page.text, features='html.parser')
 
         all_span = [_.text for _ in soup2.find_all('span')]
@@ -75,16 +75,28 @@ def get_addresses(soup):
 
 
 def scrap(soup):
-    df = pd.DataFrame(columns=['Name', 'Siret', 'Date', 'Address', 'Price',])
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price',])
     df['Name'] = get_names(soup)
     df['Date'] = datetime.date.today()
     df['Address'] = get_addresses(soup)
     df['Price'] = get_prices(soup)
 
+    for name in df['Name']:
+        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + str(datetime.date.today())
+        if not os.path.exists(newpath):
+            os.makedirs(newpath)
+
+        html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/'
+                     + str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
+
+        with open(html_path, 'wt', encoding='utf-8') as html_file:
+            for line in soup.prettify():
+                html_file.write(line)
+
     return df
 
 
-def main():
+def complete_scraping():
     url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
     page = requests.get(url)
     soup = BeautifulSoup(page.text, features='html.parser')
@@ -105,7 +117,3 @@ def main():
 
 
     return df
-
-ic()
-main().to_csv('/Users/oliviermeyer/Desktop/petit_fute_siret_test.csv', index=False, header=True, escapechar='\\')
-ic()
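
Note: the archiving loop added to scrap() rewrites the same listing-page HTML once per restaurant name and builds paths by string concatenation. Below is a minimal sketch of an equivalent step, assuming the same per-date folder layout as in the diff; the helper name archive_listing_page and the base_dir parameter are placeholders, not part of the script.

import datetime
import os

def archive_listing_page(soup, names, base_dir):
    # One sub-folder per scraping date, e.g. <base_dir>/2024-05-01/
    day_dir = os.path.join(base_dir, str(datetime.date.today()))
    os.makedirs(day_dir, exist_ok=True)  # replaces the explicit os.path.exists() check

    for name in names:
        # '/' in a restaurant name would otherwise be treated as a path separator
        html_path = os.path.join(day_dir, name.replace('/', '-') + '.html')
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            # soup.prettify() returns the whole document as one string,
            # so it can be written in a single call rather than character by character
            html_file.write(soup.prettify())

Called as archive_listing_page(soup, df['Name'], '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute'), this would produce the same files as the loop in the diff, minus the repeated directory check.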