Upload files to "/"

This commit is contained in:
Olivier MEYER 2024-06-25 17:34:46 +02:00
parent 915f0a7d12
commit 76c5b520de

petit_fute_scraping_v2.py Normal file

@@ -0,0 +1,111 @@
from bs4 import BeautifulSoup
import requests
import datetime
import pandas as pd
from icecream import ic
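
# The script walks the paginated Petit Futé listing of Paris restaurants,
# pulls each restaurant's name and price from the listing pages, fetches
# every detail page for a street address, and writes the result to a CSV
# (Name / Siret / Date / Address / Price).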


def get_names(soup):
    # Restaurant names are the <h3> headings on the listing page.
    return [_.text.strip() for _ in soup.find_all('h3')]


def get_number_of_pages(soup):
    # The 14th <ul> on the page is the pagination list; the slice pulls
    # the last page number out of its text.
    return int([_.text for _ in soup.find_all('ul')][13][-3:-1])
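
# Both the hardcoded index (13) and the [-3:-1] slice are tied to the
# current layout: they assume the pagination text ends with a two-digit
# page count followed by one more character. If the markup shifts, this
# raises IndexError or ValueError instead.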


def get_prices(soup):
    # Prices appear inside <div> blocks as '... • €NN ...'.
    prices = [_.text.strip() for _ in soup.find_all('div') if ' • €' in _.text]
    prices_v2 = []
    for price in prices:
        x = price.find(' • €')
        prices_v2.append(price[x + 3:x + 8].strip())
    # The same price text repeats across nested <div>s, so the stride-6
    # slice keeps a single copy of each.
    return [prices_v2[_] for _ in range(0, len(prices_v2[6:]) + 7, 6)][1:]
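
# For instance, a <div> whose text ends in '... • €25' yields '€25' from
# the x+3:x+8 slice (the euro sign is kept in the output).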


def get_url(soup):
    # Keep only relative links to '.html' detail pages; the first and last
    # matches are dropped (presumably not restaurant links).
    links = [str(_) for _ in soup.find_all('a') if '.html' in str(_) and 'https:' not in str(_)][1:-1]
    links_v2 = []
    for link in links:
        # Trim each <a> tag down to its href value, up to and including '.html'.
        x = link.find('.html') + 5
        links_v2.append(link[:x].split('href')[-1][3:])
    links_v3 = []
    for link in links_v2:
        # Prefix the domain and drop duplicates while preserving order.
        if 'https://www.petitfute.com/' + link not in links_v3:
            links_v3.append('https://www.petitfute.com/' + link)
    return links_v3
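
# The result is an ordered, de-duplicated list of absolute detail-page
# URLs of the form 'https://www.petitfute.com/<slug>.html' (the slug shape
# is the site's own, not guaranteed here).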


def get_addresses(soup):
    urls = get_url(soup)
    ic(len(urls))
    addresses = []
    for url in urls:
        ic(url)
        page = requests.get(url)
        ic(page)
        soup2 = BeautifulSoup(page.text, features='html.parser')
        all_span = [_.text for _ in soup2.find_all('span')]
        # The two <span>s just before the one reading 'Paris,' hold the
        # street number and the street name; join them and normalise spaces.
        address = (all_span[all_span.index('Paris,') - 2]
                   + all_span[all_span.index('Paris,') - 1]).replace(',', ' ').replace('  ', ' ')
        # Deal with words before the street number
        if not address.split(' ')[0].isdigit():
            address_piece_by_piece = address.split(' ')
            for i in range(len(address_piece_by_piece)):
                try:
                    if address_piece_by_piece[i][0].isdigit():
                        address = ' '.join(address_piece_by_piece[i:])
                        break
                except IndexError:
                    pass
        # Deal with double street numbers (keep only the first number)
        if '-' in address[:5]:
            address = address.split('-')[0] + ' ' + ' '.join(address.split(' ')[1:])
        # Deal with abbreviations
        address = address.replace('Av.', 'avenue').replace('bis ', '')
        ic(address)
        addresses.append(address)
    return addresses
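
# Worked example (hypothetical input): 'Le X 12-14 Av. de Y' comes out as
# '12 avenue de Y'. The leading words are dropped, the double street number
# is trimmed to its first half, and 'Av.' is expanded.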


def scrap(soup):
    df = pd.DataFrame(columns=['Name', 'Siret', 'Date', 'Address', 'Price'])
    df['Name'] = get_names(soup)
    df['Date'] = datetime.date.today()
    df['Address'] = get_addresses(soup)
    df['Price'] = get_prices(soup)
    # 'Siret' stays empty at this stage.
    return df
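
# Note: once the first column assignment has fixed the row count, a later
# list of a different length makes pandas raise ValueError; main() relies
# on that to skip malformed pages.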


def main():
    url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    df = scrap(soup)
    number_of_pages = get_number_of_pages(soup)
    # Note: the loop starts at ?page=0, so the first listing page may be
    # scraped more than once.
    for i in range(number_of_pages + 1):
        try:
            ic(page)
            new_url = url + '?page=' + str(i)
            new_page = requests.get(new_url)
            new_soup = BeautifulSoup(new_page.text, features='html.parser')
            temporary_df = scrap(new_soup)
            df = pd.concat([df, temporary_df], ignore_index=True)
        except ValueError:
            # A page without the expected markup makes scrap() raise
            # ValueError; skip it and move on.
            pass
    return df


if __name__ == '__main__':
    ic()  # log a start marker
    main().to_csv('/Users/oliviermeyer/Desktop/petit_fute_siret_test.csv', index=False, header=True, escapechar='\\')
    ic()  # log an end marker
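
# Usage sketch: run the file directly, e.g. `python petit_fute_scraping_v2.py`;
# the output path above is hard-coded, so adjust it before running elsewhere.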