Actualiser petit_fute_scraping.py

2024-06-27 11:04:42 +02:00 · 2024-06-27 11:04:42 +02:00 · 2c93071a4f
commit 2c93071a4f
parent c9ad358993
1 changed file with 16 additions and 8 deletions
--- a/petit_fute_scraping.py
+++ b/petit_fute_scraping.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 import requests
 import datetime
 import os
 import pandas as pd
 from icecream import ic
@@ -39,12 +40,11 @@ def get_url(soup):
 def get_addresses(soup):
    urls = get_url(soup)
    ic(len(urls))
    addresses = []
    for url in urls:
        ic()
        ic(url)
        page = requests.get(url)
        ic(page)
        soup2 = BeautifulSoup(page.text, features='html.parser')
        all_span = [_.text for _ in soup2.find_all('span')]
@@ -75,16 +75,28 @@ def get_addresses(soup):
 def scrap(soup):
-    df = pd.DataFrame(columns=['Name', 'Siret', 'Date', 'Address', 'Price',])
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price',])
    df['Name'] = get_names(soup)
    df['Date'] = datetime.date.today()
    df['Address'] = get_addresses(soup)
    df['Price'] = get_prices(soup)
    for name in df['Name']:
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
                     str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            for line in soup.prettify():
                html_file.write(line)
    return df
-def main():
+def complete_scraping():
    url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
@@ -105,7 +117,3 @@ def main():
    return df
 ic()
 main().to_csv('/Users/oliviermeyer/Desktop/petit_fute_siret_test.csv', index=False, header=True, escapechar='\\')
 ic()