From 2fcd47baa6065baa6179f24925f978457beb8526 Mon Sep 17 00:00:00 2001
From: Olivier MEYER
Date: Tue, 25 Jun 2024 17:30:41 +0200
Subject: [PATCH] Upload files to "/"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 resto_de_paris_scraping.py | 159 +++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 resto_de_paris_scraping.py

diff --git a/resto_de_paris_scraping.py b/resto_de_paris_scraping.py
new file mode 100644
index 0000000..e96d655
--- /dev/null
+++ b/resto_de_paris_scraping.py
@@ -0,0 +1,159 @@
+from bs4 import BeautifulSoup
+import requests
+import datetime
+import os
+import pandas as pd
+from icecream import ic
+
+
+# Listing pages whose restaurant links are harvested.
+URLS = ['https://restodeparis.com/coup-de-coeur/',
+        'https://restodeparis.com/selection-michelin/',
+        'https://restodeparis.com/restaurant/gastronomique/',
+        'https://restodeparis.com/restaurant/bistronomique/',
+        'https://restodeparis.com/restaurant/francais/',
+        'https://restodeparis.com/restaurant/cuisine-du-monde/']
+
+
+def get_names(soup):
+    """Extract the restaurant names from one listing page."""
+    # Restaurant links are the anchors carrying target="_self" and a title.
+    anchors = soup.find_all('a', target='_self', title=True)
+    # Among the last eleven such anchors, every second one holds the
+    # visible restaurant name; the ones in between wrap images.
+    return [a.get_text() for a in anchors[-11::2]]
+
+
+def get_all_names(urls):
+    """Collect every restaurant name and turn it into a URL slug."""
+    names = []
+    for url in urls:
+        page = requests.get(url)
+        soup = BeautifulSoup(page.text, features='html.parser')
+        names += get_names(soup)
+
+    # Build slugs the way the site does: spaces become hyphens, accents
+    # are stripped, everything is lowercased.  (A unicodedata-based
+    # alternative is sketched just after this function.)
+    names = [name.replace(' ', '-').replace('’', '').replace('\'', '')
+             .replace('à', 'a')
+             .replace('â', 'a')
+             .replace('ắ', 'a')
+             .replace('é', 'e')
+             .replace('è', 'e')
+             .replace('î', 'i')
+             .replace('ï', 'i')
+             .replace('ö', 'o')
+             .replace('œ', 'oe')
+             .replace('ü', 'u')
+             .replace('ç', 'c')
+             .replace('---', '-')
+             .lower()
+             for name in names]
+
+    # Deduplicate (the listing pages overlap) and sort for stable output.
+    return sorted(set(names))
+
+
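+# A possible alternative to the .replace() chain above, sketched with the
+# standard library (illustrative only; unicodedata is not used elsewhere
+# in this script):
+#
+#     import unicodedata
+#
+#     def strip_accents(text):
+#         decomposed = unicodedata.normalize('NFKD', text)
+#         return ''.join(c for c in decomposed if not unicodedata.combining(c))
+#
+# NFKD leaves 'œ' intact (it has no decomposition to 'oe'), which is one
+# case the explicit chain handles that this sketch would not.
+
+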
+def get_restaurants_url(names):
+    """Build the page URL for each restaurant slug."""
+    restaurants_url = []
+    for name in names:
+        if name == 'cafe-sud':
+            # Special case: the 'cafe-sud' listing is served from the
+            # 'la-regalade-saint-honore' page.
+            url = 'https://restodeparis.com/restaurant/la-regalade-saint-honore/'
+        else:
+            url = 'https://restodeparis.com/restaurant/' + name + '/'
+        restaurants_url.append(url)
+    return restaurants_url
+
+
+def get_address(soup):
+    """Pull the street address out of a restaurant page."""
+    spans = [span.text for span in soup.find_all('span')]
+    address = ''
+    for i in range(len(spans) - 1):
+        # The address sits in the <span> right after the "Y Aller !"
+        # button; [1:-4] trims the surrounding formatting characters.
+        if 'Y Aller !' in spans[i]:
+            address = spans[i + 1][1:-4]
+    return address.replace(',', '')
+
+
+def get_menu(soup):
+    """Return the 'La Carte' section of a restaurant page as {item: price}."""
+    prices = [str(span.text) for span in soup.find_all('span')]
+    # The trailing 23 <span> tags are page footer, not menu content.
+    prices = prices[:-23]
+
+    # Drop empty entries before any indexing below.
+    prices = [price for price in prices if price and price != '\n\n']
+
+    # Strip a leading newline and anything from the first tab onwards.
+    prices_v2 = []
+    for price in prices:
+        if price[0] == '\n':
+            prices_v2.append(price[1:price.find('\t')])
+        else:
+            prices_v2.append(price)
+
+    # Keep only what follows the last 'La Carte' heading.
+    index_la_carte = 0
+    for i, entry in enumerate(prices_v2):
+        if entry == 'La Carte':
+            index_la_carte = i + 1
+    prices_v2 = prices_v2[index_la_carte:]
+
+    # Cut off the trailing 'Le Restaurant' section, if present.
+    if 'Le Restaurant' in prices_v2:
+        prices_v2 = prices_v2[:prices_v2.index('Le Restaurant')]
+
+    # Trim everything after the last entry that actually shows a price.
+    index_last_price = 0
+    for i, entry in enumerate(prices_v2):
+        ic(entry)
+        if '€' in entry:
+            index_last_price = i + 1
+    if index_last_price != 0:
+        prices_v2 = prices_v2[:index_last_price]
+
+    # Normalise decimal points to the French comma.
+    prices_v2 = [price.replace('.', ',') for price in prices_v2]
+
+    # The list now alternates label, price, label, price ... (a worked
+    # example sits at the end of this file); pair entries into a dict.
+    prices_v3 = {}
+    for i in range(0, len(prices_v2) - 1, 2):
+        prices_v3[prices_v2[i]] = prices_v2[i + 1]
+
+    return prices_v3
+
+
+def main():
+    names = get_all_names(URLS)
+    restaurants_url = get_restaurants_url(names)
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
+
+    for name, url in zip(names, restaurants_url):
+        ic(name)
+        page = requests.get(url)
+        soup = BeautifulSoup(page.text, features='html.parser')
+        menu = get_menu(soup)  # parse once, reuse for both columns
+        address = get_address(soup)
+        ic(address)
+
+        # Scalars (name, date, address) broadcast to the menu's length.
+        temporary_df = pd.DataFrame({'Name': name,
+                                     'Date': datetime.date.today(),
+                                     'Address': address,
+                                     'Item': list(menu.keys()),
+                                     'Price': list(menu.values())})
+
+        df = pd.concat([df, temporary_df], ignore_index=True)
+
+        # Keep a dated copy of the raw page next to the scraped data.
+        newpath = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Resto_de_Paris/'
+                   + str(datetime.date.today()))
+        if not os.path.exists(newpath):
+            os.makedirs(newpath)
+
+        html_path = os.path.join(newpath, name.replace('/', '-') + '.html')
+        with open(html_path, 'wt', encoding='utf-8') as html_file:
+            html_file.write(soup.prettify())
+
+    return df
+
+
+if __name__ == '__main__':
+    ic()  # marks the start of the run in the debug output
+    df = main()
+    df.to_csv('/Users/oliviermeyer/Desktop/resto_de_paris_test.csv',
+              index=False, header=True, escapechar='\\')
+    ic()  # marks the end of the run
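+# Worked example of the label/price pairing done at the end of get_menu,
+# on made-up values (the real list comes from the page's <span> tags):
+#
+#     >>> flat = ['Entrée + Plat', '32,00 €', 'Menu Dégustation', '85,00 €']
+#     >>> {flat[i]: flat[i + 1] for i in range(0, len(flat) - 1, 2)}
+#     {'Entrée + Plat': '32,00 €', 'Menu Dégustation': '85,00 €'}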