Upload files to "/"
commit 5c576928da
le_fooding_scraping.py (new file, 204 lines)

@@ -0,0 +1,204 @@
from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic


# Number of pages
def get_number_of_pages(soup):
    pages = [_ for _ in soup.find_all()]
    number_of_pages = []

    for _ in pages:
        if str(_).find('paris-8246?page=') != -1:
            x = str(_).find('paris-8246?page=')
            # Up to two characters follow 'page='; keep only the digits so
            # single-digit page numbers are parsed as well.
            y = str(_)[x + 16: x + 18]
            digits = ''.join(c for c in y if c.isdigit())
            if digits:
                number_of_pages.append(int(digits))
    # The largest page number linked from the pagination is the page count
    # (a set has no defined order, so take max() explicitly).
    ic(max(number_of_pages))
    return max(number_of_pages)


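# A regular expression over the raw HTML is an equivalent way to read the
# page count and copes with page numbers of any width; a sketch only, not
# called by the scraper below.
import re

def get_number_of_pages_re(html):
    # Collect every 'paris-8246?page=N' occurrence and keep the largest N.
    pages = re.findall(r'paris-8246\?page=(\d+)', html)
    return max(int(p) for p in pages) if pages else 1

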
# Restaurant names
def get_names(soup):
    # The last twelve <h2> elements on a results page are the restaurant
    # cards; str(_)[41:-5] strips the surrounding tag markup, and headings
    # that embed a <span class=...> are skipped.
    names = [str(_)[41:-5] for _ in soup.find_all('h2')[-12:] if '<span class=' not in str(_)]
    # Build URL slugs: hyphenate spaces, drop apostrophes, fold accents.
    new_names = [
        name.replace(' ', '-')
        .replace('’', '')
        .replace('\'', '')
        .replace('à', 'a')
        .replace('â', 'a')
        .replace('ắ', 'a')
        .replace('é', 'e')
        .replace('è', 'e')
        .replace('î', 'i')
        .replace('ï', 'i')
        .replace('ö', 'o')
        .replace('œ', 'oe')
        .replace('ü', 'u')
        .replace('ç', 'c')
        .lower()
        for name in names
    ]
    return new_names


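# Alternative sketch: the replace() chain above can be generalised with
# unicodedata, which folds any combining accent. 'œ' is special-cased
# because NFKD does not decompose the oe ligature. Not wired into
# get_names().
import unicodedata

def fold_accents(name):
    # Decompose accented letters, drop the combining marks, then apply the
    # same space/apostrophe handling as get_names().
    decomposed = unicodedata.normalize('NFKD', name.replace('œ', 'oe'))
    ascii_name = ''.join(c for c in decomposed if not unicodedata.combining(c))
    return ascii_name.replace(' ', '-').replace('’', '').replace('\'', '').lower()

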
# Le Fooding price dictionary
prices_dictionary = {0: 'Moins de 15 €',
                     1: '16 à 35 €',
                     2: '36 à 50 €',
                     3: 'Plus de 51 €'}


# Single restaurant price
def get_price(page):
    soup = BeautifulSoup(page.text, features='html.parser')

    price = set(_ for _ in soup.find_all())
    restaurant_price = []

    for _ in price:
        # Position of each price label inside the element, -1 when absent.
        p = [str(_).find('Moins de 15 €'),
             str(_).find('De 16 à 35'),
             str(_).find('De 36 à 50 €'),
             str(_).find('Plus de 51')]

        if sum(p) != -4:  # at least one label found
            if -1 in p:  # ignore elements that contain every label at once
                for __ in range(len(p)):
                    if p[__] != -1:
                        restaurant_price.append(__)
    if not restaurant_price:
        # No price label anywhere on the page.
        return None
    # The most frequently matched label wins.
    return prices_dictionary[max(set(restaurant_price), key=restaurant_price.count)]


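# Equivalent lookup on the visible page text instead of scanning every tag;
# a sketch that assumes the price label appears verbatim in the body. Note
# the first match wins here, unlike the frequency vote above.
def get_price_from_text(page):
    text = BeautifulSoup(page.text, features='html.parser').get_text()
    labels = ['Moins de 15 €', 'De 16 à 35', 'De 36 à 50 €', 'Plus de 51']
    for index, label in enumerate(labels):
        if label in text:
            return prices_dictionary[index]
    return None

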
def get_address(soup):
    address = ''
    for _ in soup:
        if str(_).find('</a></address>') != -1:
            x = str(_).find('</a></address>')
            # Keep the 100 characters before the closing tags, then trim the
            # markup and the trailing ', France' boilerplate away.
            address = str(_)[x - 100:x]
            try:
                address = address.split('>')[1:][0]
            except IndexError:
                pass
            address = address.lstrip()[:-13]
            address = ''.join(address.split('France')[:-1])[:-2]

    return address.replace(',', '')


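# Selector-based sketch of the same extraction, assuming the page marks the
# address up as <address>...<a>...</a></address> (which is what the string
# search in get_address() implies). Not used by the scraper.
def get_address_via_tag(soup):
    tag = soup.find('address')
    if tag is None:
        return ''
    link = tag.find('a')
    # Prefer the link text, fall back to the whole <address> element.
    text = (link or tag).get_text(strip=True)
    return ''.join(text.split('France')[:-1]).rstrip(', ').replace(',', '')

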
# Restaurant prices and addresses of a complete page
def get_prices_and_addresses(names):
    prices = []
    addresses = []
    for name in names:
        address = ''
        soup = ''
        if not name.isascii():
            # The slug still contains a character the replace() chain in
            # get_names() does not cover; skip it rather than request a bad
            # URL, and keep prices/addresses aligned with names.
            ic()
            prices.append('Not ASCII')
            addresses.append('')
        else:
            new_url = 'https://lefooding.com/restaurants/' + name.lower()
            ic(new_url)
            new_page = requests.get(new_url)
            x = 0
            match new_page.status_code:
                case 200:
                    ic()
                    x = get_price(new_page)
                    soup = BeautifulSoup(new_page.text, features='html.parser')
                    address = get_address(soup)
                    ic(address)
                case 404:
                    # Retry with the alternative 'restaurant-<name>-paris' slug.
                    ic()
                    new_url = 'https://lefooding.com/restaurants/restaurant-' + name.lower() + '-paris'
                    new_page = requests.get(new_url)
                    ic(new_url)
                    match new_page.status_code:
                        case 200:
                            ic()
                            x = get_price(new_page)
                            soup = BeautifulSoup(new_page.text, features='html.parser')
                            address = get_address(soup)
                            ic(address)
                        case 404:
                            # Last resort: try numbered variants of the slug.
                            ic()
                            x = '<Response [404]>'
                            for i in range(1, 21):
                                ic()
                                new_url2 = new_url + '-' + str(i)
                                new_page = requests.get(new_url2)
                                if new_page.status_code == 200:
                                    ic()
                                    x = get_price(new_page)
                                    soup = BeautifulSoup(new_page.text, features='html.parser')
                                    address = get_address(soup)
                                    ic(address)
                                    break

            prices.append(x)
            addresses.append(address)

            if soup != '':
                # Archive the fetched page under today's date.
                newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' + str(datetime.date.today())
                if not os.path.exists(newpath):
                    os.makedirs(newpath)

                html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
                             str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')

                with open(html_path, 'wt', encoding='utf-8') as html_file:
                    html_file.write(soup.prettify())

    return prices, addresses


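# The scraper issues many sequential requests.get() calls; routing them
# through one Session with a timeout is a common hardening step. A sketch
# only, not wired into the functions above; the 10 s timeout is arbitrary.
_session = requests.Session()

def fetch(url, timeout=10):
    # Connection reuse plus a hard cap on how long a hung server can block.
    return _session.get(url, timeout=timeout)

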
# Scrape one results page into a DataFrame
def scrap_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')

    names = get_names(soup)
    prices, addresses = get_prices_and_addresses(names)
    ic(prices, addresses)

    df = pd.DataFrame(list(zip(names, addresses, prices)), columns=['Name', 'Address', 'Price'])
    df['Date'] = datetime.date.today()
    print(df.to_string())
    return df


# Main function
def main():
    ic()
    # Initialization
    starting_url = 'https://lefooding.com/recherche/restaurant/place/paris-8246'
    page = requests.get(starting_url)
    soup = BeautifulSoup(page.text, features='html.parser')

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price'])
    number_of_pages = get_number_of_pages(soup)

    temporary_df = scrap_page(starting_url)
    df = pd.concat([df, temporary_df], ignore_index=True)
    print(df.to_string())

    # Loop through the other pages
    for i in range(2, number_of_pages + 1):
        ic(i)
        new_url = starting_url + '?page=' + str(i)
        temporary_df = scrap_page(new_url)
        df = pd.concat([df, temporary_df], ignore_index=True)
        print(df.to_string())

    return df


if __name__ == '__main__':
    df = main()
    df.to_csv('/Users/oliviermeyer/Desktop/le_fooding_test.csv', index=False, header=True, escapechar='\\')