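"""Scraper for restodeparis.com: collects restaurant names from the category
pages, then visits each restaurant page to record its address and 'La Carte'
menu prices into a pandas DataFrame, archiving the raw HTML along the way."""
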
import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from icecream import ic


# Category pages listing the restaurants to scrape.
URLS = ['https://restodeparis.com/coup-de-coeur/',
        'https://restodeparis.com/selection-michelin/',
        'https://restodeparis.com/restaurant/gastronomique/',
        'https://restodeparis.com/restaurant/bistronomique/',
        'https://restodeparis.com/restaurant/francais/',
        'https://restodeparis.com/restaurant/cuisine-du-monde/']


def get_names(soup):
    # Restaurant links are the anchors rendered with target="_self" followed
    # by a title attribute; filter on that literal pattern in the markup.
    links = soup.find_all('a')
    named_links = [link for link in links
                   if str(link).find('target="_self" title=') != -1]

    # Among the last eleven matches, every second one is a restaurant card;
    # strip the surrounding tags to keep only the visible name.
    last_links = [str(named_links[i]) for i in range(-11, 0, 2)]
    return [link[:-4].split('>')[-1] for link in last_links]


def get_all_names(urls):
    names = []
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        names += get_names(soup)

    # Turn each display name into the slug used in the site's URLs, e.g.
    # "L'Assiette" -> "lassiette": hyphenate spaces, drop apostrophes,
    # transliterate accented letters, then lowercase.
    names = [name.replace(' ', '-').replace('’', '').replace('\'', '')
             .replace('à', 'a').replace('â', 'a').replace('ắ', 'a')
             .replace('é', 'e').replace('è', 'e')
             .replace('î', 'i').replace('ï', 'i')
             .replace('ö', 'o').replace('œ', 'oe')
             .replace('ü', 'u').replace('ç', 'c')
             .replace('---', '-')
             .lower()
             for name in names]

    return sorted(set(names))


def get_restaurants_url(names):
    restaurants_url = []
    for name in names:
        if name == 'cafe-sud':
            # Special-case mapping kept for this slug, which does not resolve
            # to its own page.
            url = 'https://restodeparis.com/restaurant/la-regalade-saint-honore/'
        else:
            url = 'https://restodeparis.com/restaurant/' + name + '/'
        restaurants_url.append(url)
    return restaurants_url


def get_address(soup):
    # The address appears in the span that follows the 'Y Aller !' button;
    # the slice trims the stray characters around it.
    spans = [span.text for span in soup.find_all('span')]
    address = ''
    for i, text in enumerate(spans):
        if text.find('Y Aller !') != -1:
            address = spans[i + 1][1:-4]
    return address.replace(',', '')


def get_menu(soup):
    prices = [str(span.text) for span in soup.find_all('span')]
    # The last 23 spans on a restaurant page are footer boilerplate.
    prices = prices[:-23]

    # Drop empty entries; building a new list avoids the bug of removing
    # items from a list while iterating over it.
    prices = [price for price in prices if price and price != '\n\n']

    # Entries padded with a newline carry their text before the first tab.
    prices_v2 = []
    for price in prices:
        if price[0] == '\n':
            prices_v2.append(price[1:price.find('\t')])
        else:
            prices_v2.append(price)

    # Keep only what follows the last 'La Carte' heading.
    index_la_carte = 0
    for i in range(len(prices_v2)):
        if prices_v2[i] == 'La Carte':
            index_la_carte = i + 1
    prices_v2 = prices_v2[index_la_carte:]

    # Cut the trailing 'Le Restaurant' block, scanning from the end; stop at
    # the first hit so later indices don't run past the shortened list.
    for i in range(1, len(prices_v2) - 1):
        if prices_v2[-i] == 'Le Restaurant':
            prices_v2 = prices_v2[:-i]
            break

    # Truncate after the last entry that carries a euro amount.
    index_last_price = 0
    for i in range(len(prices_v2)):
        if '€' in prices_v2[i]:
            index_last_price = i + 1
    if index_last_price != 0:
        prices_v2 = prices_v2[:index_last_price]

    # French decimal notation for prices.
    prices_v2 = [price.replace('.', ',') for price in prices_v2]

    # Pair each item with the price that follows it; zip silently drops a
    # dangling item with no matching price instead of raising IndexError.
    return dict(zip(prices_v2[0::2], prices_v2[1::2]))


def complete_scraping():
    names = get_all_names(URLS)
    restaurants_url = get_restaurants_url(names)
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])

    for name, url in zip(names, restaurants_url):
        ic()
        ic(name)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')

        # Parse the menu once and reuse it for both columns.
        menu = get_menu(soup)
        temporary_df = pd.DataFrame({'Item': list(menu.keys()),
                                     'Price': list(menu.values())})
        temporary_df['Name'] = name
        temporary_df['Date'] = datetime.date.today()
        temporary_df['Address'] = get_address(soup)

        df = pd.concat([df, temporary_df], ignore_index=True)

        # Archive the raw HTML in a folder named after today's date.
        newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/'
                   'Resto_de_Paris/' + str(datetime.date.today()))
        if not os.path.exists(newpath):
            os.makedirs(newpath)

        html_path = newpath + '/' + name.replace('/', '-') + '.html'
        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(soup.prettify())

    # Restaurants whose page yielded no address are dropped.
    df = df.dropna(subset='Address')
    return df
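
# Example entry point (illustrative sketch): run the full scrape and persist
# the result. The CSV file name here is an assumption, not part of the
# original pipeline.
if __name__ == '__main__':
    df = complete_scraping()
    ic(df.head())
    df.to_csv('resto_de_paris_' + str(datetime.date.today()) + '.csv',
              index=False)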