Actualiser petit_fute_scraping.py
This commit is contained in:
parent
c9ad358993
commit
2c93071a4f
|
@ -1,6 +1,7 @@
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import requests
|
import requests
|
||||||
import datetime
|
import datetime
|
||||||
|
import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from icecream import ic
|
from icecream import ic
|
||||||
|
|
||||||
|
@ -39,12 +40,11 @@ def get_url(soup):
|
||||||
|
|
||||||
def get_addresses(soup):
|
def get_addresses(soup):
|
||||||
urls = get_url(soup)
|
urls = get_url(soup)
|
||||||
ic(len(urls))
|
|
||||||
addresses = []
|
addresses = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
|
ic()
|
||||||
ic(url)
|
ic(url)
|
||||||
page = requests.get(url)
|
page = requests.get(url)
|
||||||
ic(page)
|
|
||||||
soup2 = BeautifulSoup(page.text, features='html.parser')
|
soup2 = BeautifulSoup(page.text, features='html.parser')
|
||||||
|
|
||||||
all_span = [_.text for _ in soup2.find_all('span')]
|
all_span = [_.text for _ in soup2.find_all('span')]
|
||||||
|
@ -75,16 +75,28 @@ def get_addresses(soup):
|
||||||
|
|
||||||
|
|
||||||
def scrap(soup):
    """Build the restaurant table for one listing page and archive its HTML.

    The table gets one row per name returned by ``get_names(soup)``, with
    today's date, the addresses from ``get_addresses(soup)`` and the prices
    from ``get_prices(soup)`` (helpers defined elsewhere in this file).

    As a side effect, the prettified HTML of ``soup`` is saved once per name
    under a dated "Petit Fute" snapshot directory, as ``<name>.html``.

    Args:
        soup: Parsed listing page (a BeautifulSoup object, judging by the
            ``prettify()`` call and the callers in this file).

    Returns:
        A pandas DataFrame with columns ``Name``, ``Date``, ``Address`` and
        ``Price``.
    """
    # Read the date once so the 'Date' column and the snapshot folder name
    # cannot disagree if the run crosses midnight.
    today = datetime.date.today()

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    df['Name'] = get_names(soup)
    df['Date'] = today
    df['Address'] = get_addresses(soup)
    df['Price'] = get_prices(soup)

    # The directory that is created must be the one that is written to.
    # Previously the folder was created under "Petit Fute" while the files
    # were opened under "Le Fooding", which fails when that folder is absent.
    snapshot_dir = os.path.join(
        '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute',
        str(today),
    )
    os.makedirs(snapshot_dir, exist_ok=True)

    # Render once: the same listing-page HTML is stored for every name.
    # NOTE(review): every file therefore has identical content -- confirm
    # that the listing page (not each restaurant's own page) is what should
    # be archived here.
    html = soup.prettify()

    for name in df['Name']:
        # '/' in a name would be read as a path separator, so replace it.
        # Assumes names are strings -- TODO confirm against get_names().
        html_path = os.path.join(snapshot_dir, name.replace('/', '-') + '.html')
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(html)

    return df
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def complete_scraping():
|
||||||
url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
|
url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
|
||||||
page = requests.get(url)
|
page = requests.get(url)
|
||||||
soup = BeautifulSoup(page.text, features='html.parser')
|
soup = BeautifulSoup(page.text, features='html.parser')
|
||||||
|
@ -105,7 +117,3 @@ def main():
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
ic()
|
|
||||||
main().to_csv('/Users/oliviermeyer/Desktop/petit_fute_siret_test.csv', index=False, header=True, escapechar='\\')
|
|
||||||
ic()
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user