Actualiser petit_fute_scraping.py

This commit is contained in:
Olivier MEYER 2024-06-27 11:04:42 +02:00
parent c9ad358993
commit 2c93071a4f

View File

@ -1,6 +1,7 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import requests import requests
import datetime import datetime
import os
import pandas as pd import pandas as pd
from icecream import ic from icecream import ic
@ -39,12 +40,11 @@ def get_url(soup):
def get_addresses(soup): def get_addresses(soup):
urls = get_url(soup) urls = get_url(soup)
ic(len(urls))
addresses = [] addresses = []
for url in urls: for url in urls:
ic()
ic(url) ic(url)
page = requests.get(url) page = requests.get(url)
ic(page)
soup2 = BeautifulSoup(page.text, features='html.parser') soup2 = BeautifulSoup(page.text, features='html.parser')
all_span = [_.text for _ in soup2.find_all('span')] all_span = [_.text for _ in soup2.find_all('span')]
@ -75,16 +75,28 @@ def get_addresses(soup):
def scrap(soup):
    """Build a DataFrame of restaurant listings from a parsed results page
    and archive the page's HTML to disk, one file per restaurant name.

    Parameters:
        soup: BeautifulSoup of the Petit Fute results page.

    Returns:
        pandas.DataFrame with columns Name, Date, Address, Price.
    """
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price',])
    df['Name'] = get_names(soup)
    df['Date'] = datetime.date.today()
    df['Address'] = get_addresses(soup)
    df['Price'] = get_prices(soup)
    # Archive the scraped page once per listing name.
    # NOTE(review): every file receives the same full-page HTML (soup is the
    # whole results page) — presumably intentional snapshotting; confirm.
    for name in df['Name']:
        # BUG FIX: the directory previously created was
        # '.../HTML/Petit Fute/<date>' while the file was opened under
        # '.../HTML/Le Fooding/<date>/...' (copy-paste from another scraper),
        # so open() failed unless the Le Fooding folder already existed.
        # Both now consistently use the Petit Fute directory.
        newpath = os.path.join(
            '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute',
            str(datetime.date.today()))
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        # '/' in a restaurant name would be read as a path separator.
        html_path = os.path.join(newpath, name.replace('/', '-') + '.html')
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            # FIX: the original iterated soup.prettify() character by
            # character (a str iterates chars, not lines); write it once.
            html_file.write(soup.prettify())
    return df
def main(): def complete_scraping():
url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/' url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
page = requests.get(url) page = requests.get(url)
soup = BeautifulSoup(page.text, features='html.parser') soup = BeautifulSoup(page.text, features='html.parser')
@ -105,7 +117,3 @@ def main():
return df return df
ic()
main().to_csv('/Users/oliviermeyer/Desktop/petit_fute_siret_test.csv', index=False, header=True, escapechar='\\')
ic()