diff --git a/petit_fute_scraping_v2.py b/petit_fute_scraping_v2.py
new file mode 100644
index 0000000..a9b9de7
--- /dev/null
+++ b/petit_fute_scraping_v2.py
@@ -0,0 +1,133 @@
+from bs4 import BeautifulSoup
+import requests
+import datetime
+import pandas as pd
+from icecream import ic
+
+
+def get_names(soup):
+    # Restaurant names are the <h3> headings on the listing page.
+    return [_.text.strip() for _ in soup.find_all('h3')]
+
+
+def get_number_of_pages(soup):
+    # Page-structure dependent: the 14th <ul> is the paginator, and the
+    # two characters before its final one hold the last page number.
+    return int([_.text for _ in soup.find_all('ul')][13][-3:-1])
+
+
+def get_prices(soup):
+    # Price indications appear in <div> blocks as '... • €...'.
+    prices = [_.text.strip() for _ in soup.find_all('div') if ' • €' in _.text]
+    prices_v2 = []
+    for price in prices:
+        x = price.find(' • €')
+        prices_v2.append(price[x + 3:x + 8].strip())
+    # Each price is repeated by its nested parent <div>s (six matches per
+    # restaurant, page-structure dependent), so keep every sixth entry.
+    return prices_v2[6::6]
+
+
+def get_url(soup):
+    # Restaurant detail pages are relative links ending in '.html';
+    # absolute ('https:') links are site navigation, and [1:-1] drops two
+    # matching links that are not restaurants (page-structure dependent).
+    links = [str(_) for _ in soup.find_all('a')
+             if '.html' in str(_) and 'https:' not in str(_)][1:-1]
+    links_v2 = []
+    for link in links:
+        # Cut the tag after '.html', then keep only the href value.
+        x = link.find('.html') + 5
+        links_v2.append(link[:x].split('href')[-1][3:])
+
+    # Build absolute URLs, deduplicating while preserving order.
+    links_v3 = []
+    for link in links_v2:
+        full_url = 'https://www.petitfute.com/' + link
+        if full_url not in links_v3:
+            links_v3.append(full_url)
+
+    return links_v3
+
+
+def get_addresses(soup):
+    urls = get_url(soup)
+    ic(len(urls))
+    addresses = []
+    for url in urls:
+        ic(url)
+        page = requests.get(url)
+        ic(page)
+        soup2 = BeautifulSoup(page.text, features='html.parser')
+
+        # The street number and name sit in the two <span>s just before the
+        # 'Paris,' marker; .index() raises ValueError when the marker is
+        # missing, which main() treats as an unparseable page.
+        all_span = [_.text for _ in soup2.find_all('span')]
+        paris = all_span.index('Paris,')
+        address = all_span[paris - 2] + all_span[paris - 1]
+        address = address.replace(',', ' ').replace('  ', ' ')  # collapse doubled spaces
+
+        # Drop any words before the street number (e.g. a venue name).
+        pieces = address.split(' ')
+        if not pieces[0].isdigit():
+            for i, piece in enumerate(pieces):
+                if piece[:1].isdigit():  # piece[:1] is safe on empty strings
+                    address = ' '.join(pieces[i:])
+                    break
+
+        # Keep only the first of double street numbers ('12-14 rue ...').
+        if '-' in address[:5]:
+            address = address.split('-')[0] + ' ' + ' '.join(address.split(' ')[1:])
+
+        # Expand abbreviations and drop 'bis' markers.
+        address = address.replace('Av.', 'avenue').replace('bis ', '')
+
+        ic(address)
+        addresses.append(address)
+    return addresses
+
+
+def scrap(soup):
+    # Building the frame from a dict raises ValueError when the scraped
+    # columns have mismatched lengths, which main() skips; 'Siret' is
+    # left empty for a later SIRET lookup.
+    return pd.DataFrame({
+        'Name': get_names(soup),
+        'Siret': '',
+        'Date': datetime.date.today(),
+        'Address': get_addresses(soup),
+        'Price': get_prices(soup),
+    })
+
+
+def main():
+    url = 'https://www.petitfute.com/d3-paris/c1165-restaurants/'
+    page = requests.get(url)
+    soup = BeautifulSoup(page.text, features='html.parser')
+
+    df = scrap(soup)
+
+    # The base URL already serves page 1, so start at page 2 to avoid
+    # scraping it twice.
+    number_of_pages = get_number_of_pages(soup)
+    for i in range(2, number_of_pages + 1):
+        new_url = url + '?page=' + str(i)
+        ic(new_url)
+        try:
+            new_page = requests.get(new_url)
+            new_soup = BeautifulSoup(new_page.text, features='html.parser')
+            temporary_df = scrap(new_soup)
+            df = pd.concat([df, temporary_df], ignore_index=True)
+        except ValueError:
+            # Skip pages whose listings cannot be parsed.
+            pass
+
+    return df
+
+
+if __name__ == '__main__':
+    ic()  # log start of run
+    main().to_csv('/Users/oliviermeyer/Desktop/petit_fute_siret_test.csv',
+                  index=False, header=True, escapechar='\\')
+    ic()  # log end of run