from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic


# Number of pages
def get_number_of_pages(soup):
    pages = [_ for _ in soup.find_all()]
    number_of_pages = []

    for _ in pages:
        if str(_).find('paris-8246?page=') != -1:
            x = str(_).find('paris-8246?page=')
            # Two characters right after 'page='; only two-digit page numbers survive the check below.
            y = str(_)[x + 16: x + 18]
            if y[-1].isnumeric():
                number_of_pages.append(int(y))
    # The number of pages is the largest page number linked from the results page.
    last_page = max(number_of_pages)
    ic(last_page)
    return last_page


# Restaurant names
def get_names(soup):
    # The restaurant names sit in the last twelve <h2> tags of the results page;
    # fixed-width slicing strips the surrounding markup.
    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:] if '<span-class=' not in str(_)]
    # Normalize each name into a URL slug: hyphens for spaces, accents stripped, lower case.
    new_names = [
        name.replace(' ', '-')
        .replace('’', '')
        .replace('\'', '')
        .replace('à', 'a')
        .replace('â', 'a')
        .replace('ắ', 'a')
        .replace('é', 'e')
        .replace('è', 'e')
        .replace('î', 'i')
        .replace('ï', 'i')
        .replace('ö', 'o')
        .replace('œ', 'oe')
        .replace('ü', 'u')
        .replace('ç', 'c')
        .lower()
        for name in names
    ]
    return new_names
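

# NOTE (illustrative sketch, not used by the scraper): the chained .replace() calls
# above only cover the accents seen so far. Assuming the same slug format is wanted,
# a more general accent-stripper can be built with unicodedata; the helper name
# `strip_accents` is hypothetical, and 'œ' does not decompose so it still needs its own rule:
#
#     import unicodedata
#
#     def strip_accents(text):
#         decomposed = unicodedata.normalize('NFKD', text)
#         return ''.join(c for c in decomposed if not unicodedata.combining(c))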


# Le Fooding price dictionary
prices_dictionnary = {0: 'Moins de 15 €',
                      1: '16 à 35 €',
                      2: '36 à 50 €',
                      3: 'Plus de 51 €'}


# Single restaurant price
def get_price(page):
    soup = BeautifulSoup(page.text, features='html.parser')

    price = set(_ for _ in soup.find_all())
    restaurant_price = []

    for _ in price:
        # Position of each price label in the tag; -1 means the label is absent.
        p = [str(_).find('Moins de 15 €'),
             str(_).find('De 16 à 35'),
             str(_).find('De 36 à 50 €'),
             str(_).find('Plus de 51')]

        # sum(p) == -4 means none of the four labels appears in this tag.
        if sum(p) != -4:
            if -1 in p:
                for __ in range(len(p)):
                    if p[__] != -1:
                        restaurant_price.append(__)
    # The most frequently matched price bracket wins.
    return prices_dictionnary[max(set(restaurant_price), key=restaurant_price.count)]


# Single restaurant address
def get_adress(soup):
    address = ''
    for _ in soup:
        if str(_).find('</a></address>') != -1:
            x = str(_).find('</a></address>')
            # Keep the 100 characters before the closing tag, then strip the markup
            # and the trailing country part.
            address = str(_)[x - 100:x]
            try:
                address = address.split('>')[1:][0]
            except IndexError:
                pass
            address = address.lstrip()[:-13]
            address = ''.join(address.split('France')[:-1])[:-2]

    return address.replace(',', '')


# Restaurant prices and addresses for a full results page
def get_prices_and_addresses(names):
    prices = []
    adresses = []
    adress = ''
    soup = ''
    for name in names:
        ic(name)
        if not name.isascii():
            ic()
            x = 'Not ASCII'
            prices.append(x)
        else:
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            new_page = requests.get(new_url)
            x = 0
            match str(new_page):
                case '<Response [200]>':
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    adress = get_adress(soup)
                case '<Response [404]>':
                    ic()
                    # Retry with the alternative 'restaurant-<name>-paris' URL pattern.
                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
                    new_page = requests.get(new_url)
                    match str(new_page):
                        case '<Response [200]>':
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            adress = get_adress(soup)
                        case '<Response [404]>':
                            ic()
                            x = '<Response [404]>'
                            # Last resort: try the numbered variants of the URL.
                            for i in range(1, 21):
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if str(new_page) == '<Response [200]>':
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    adress = get_adress(soup)
                                    break

            prices.append(x)
            adresses.append(adress)

            # Save the fetched restaurant page locally, in a folder named after today's date.
            if soup != '':
                newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
                if not os.path.exists(newpath):
                    os.makedirs(newpath)

                html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
                             str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')

                with open(html_path, 'wt', encoding='utf-8') as html_file:
                    html_file.write(soup.prettify())

    return prices, adresses
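

# NOTE (illustrative, not part of the original logic): matching on str(new_page), i.e.
# the repr '<Response [200]>', works but ties the code to requests' repr format. An
# equivalent check uses the documented status_code attribute, e.g.:
#
#     if new_page.status_code == 200:
#         ...
#     elif new_page.status_code == 404:
#         ...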


# Scrape one results page
def scrap_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')

    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)

    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
    for i in range(len(df)):
        df.loc[i, 'Date'] = datetime.date.today()
    return df


# Main function
def complete_scraping():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)

    temporary_df = scrap_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)

    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrap_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)

    # Drop rows whose "name" is leftover markup rather than a restaurant name,
    # then rows with no address.
    df = df[~df['Name'].str.contains('style="display')]
    df = df.dropna(subset=['Address'])

    return df
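

# NOTE (illustrative usage sketch, not part of the original module): running the full
# scrape and saving the result; the CSV filename below is hypothetical.
if __name__ == '__main__':
    result = complete_scraping()
    result.to_csv('lefooding_paris_' + str(datetime.date.today()) + '.csv', index=False)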