diff --git a/best_restaurants_scraping_v2.py b/best_restaurants_scraping_v2.py
new file mode 100644
index 0000000..1393cb0
--- /dev/null
+++ b/best_restaurants_scraping_v2.py
@@ -0,0 +1,161 @@
+import datetime
+import os
+import re
+import time
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from icecream import ic
+from selenium import webdriver
+
+
+def get_names(soup):
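+    """Restaurant names on a listing page appear as <h4> headings."""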
+    return [_.text for _ in soup.find_all('h4')]
+
+
+def get_addresses(soup):
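+    """Extract one street address (or 'NA') per restaurant from a listing page."""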
+    # The first nine <li> entries appear to be site navigation; the rest
+    # interleave blank separators with each listing card's text lines.
+    addresses = [_.text for _ in soup.find_all('li')][9:]
+
+    # After every blank entry, the next two entries are candidate address
+    # lines for one restaurant; the trailing pair belongs to the paginator.
+    addresses_v2 = [addresses[_+1:_+3] for _ in range(len(addresses)) if
+                    addresses[_] == ' ' or addresses[_] == ''][:-1]
+
+    # Keep whichever candidate starts with a digit (a street number) but
+    # does not end with one; otherwise record the address as 'NA'.
+    addresses_v3 = []
+    for addresse in addresses_v2:
+        if addresse[0].strip()[0].isdigit() and not addresse[0].strip()[-1].isdigit():
+            addresses_v3.append(addresse[0].strip())
+        elif addresse[-1].strip()[0].isdigit() and not addresse[-1].strip()[-1].isdigit():
+            addresses_v3.append(addresse[-1].strip())
+        else:
+            addresses_v3.append('NA')
+
+    return addresses_v3
+
+
+def get_urls(soup):
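+    """Collect the detail-page URL of every restaurant linked on the page."""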
+    # Take a 120-character window at each link to a restaurant detail page,
+    # strip the leading 'href="', and trim back to the last '/' so any
+    # partially captured path segment is discarded.
+    urls_indexes = [_.start() for _ in re.finditer('href="https://bestrestaurantsparis.com/fr/r', str(soup))]
+    return ['/'.join(str(soup)[_:_+120][6:].split('/')[:-1]) + '/' for _ in urls_indexes]
+
+
+def get_prices(url):
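+    """Scrape one restaurant page into an {item: price} dict.
+
+    Also returns the page soup so the caller can archive the raw HTML.
+    """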
+    soup = get_soup(url)
+
+    # Every menu price on the page is followed by a euro sign; the last two
+    # occurrences appear to belong to page chrome, hence the [:-2].
+    prices_indexes = [_.start() for _ in re.finditer('€', str(soup))][:-2]
+
+    # For each euro sign, walk back 150 characters, keep what follows the
+    # last 'label' marker in the markup, collapse newlines, and cut the
+    # fragment at the first '<' so only "item name price" text remains.
+    prices = []
+    for index in prices_indexes:
+        try:
+            prices.append(' '.join(str(soup)[index - 150:index - 1]
+                                   .split('label')[-1][2:]
+                                   .split('\n')).split('<')[0])
+        except IndexError:
+            pass
+
+    # The last whitespace-separated token is the price; everything before it
+    # is the item name.
+    prices_v2 = {}
+    for _ in prices:
+        prices_v2[' '.join(_.split(' ')[:-1])] = _.split(' ')[-1] + ' €'
+    return prices_v2, soup
+
+
+def get_number_of_pages(soup):
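+    """The last all-digit <li> on the page holds the total page count."""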
+    return int([_.text for _ in soup.find_all('li') if _.text.isdigit()][-1])
+
+
+def scrap(soup):
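+    """Scrape one listing page into a dataframe, archiving each restaurant's HTML."""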
+    names = get_names(soup)
+    addresses = get_addresses(soup)
+    urls = get_urls(soup)
+
+    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
+    for name, address, url in zip(names, addresses, urls):
+        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
+        ic()
+        ic(name)
+        # ic(url)
+        # Fetch the restaurant page once: the dict fills the Item/Price
+        # columns, and the soup lets the raw HTML be archived below.
+        prices, restaurant_soup = get_prices(url)
+        temporary_df['Item'] = list(prices.keys())
+        temporary_df['Price'] = list(prices.values())
+
+        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Best Restaurants/' + str(datetime.date.today())
+        if not os.path.exists(newpath):
+            os.makedirs(newpath)
+
+        html_path = newpath + '/' + name + '.html'
+
+        with open(html_path, 'wt', encoding='utf-8') as html_file:
+            html_file.write(restaurant_soup.prettify())
+
+        # Scalar assignment broadcasts these values to every menu row.
+        temporary_df['Name'] = name
+        temporary_df['Address'] = address
+        temporary_df['Date'] = datetime.date.today()
+
+        df = pd.concat([df, temporary_df], ignore_index=True)
+    return df
+
+
+def get_soup(url):
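+    """Render url in headless Chrome and return the page parsed by BeautifulSoup."""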
+    options = webdriver.ChromeOptions()
+    options.add_argument("--headless=new")
+    driver = webdriver.Chrome(options=options)
+
+    driver.get(url)
+    time.sleep(1)  # give the page a moment to finish rendering
+
+    page_source = driver.page_source
+    driver.quit()
+    soup = BeautifulSoup(page_source, features='html.parser')
+
+    return soup
+
+
+def main():
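+    """Scrape every listing page and return one concatenated dataframe."""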
+    url = 'https://bestrestaurantsparis.com/fr/explore/'
+    soup = get_soup(url)
+
+    number_of_pages = get_number_of_pages(soup)
+    df = scrap(soup)
+    # Page 1 is the bare /explore/ URL; later pages take a ?pg= query string.
+    for i in range(2, number_of_pages + 1):
+        ic(i)
+        new_url = url + '?pg=' + str(i) + '&sort=latest'
+        ic(new_url)
+        new_soup = get_soup(new_url)
+        temporary_df = scrap(new_soup)
+        df = pd.concat([df, temporary_df], ignore_index=True)
+    return df
+
+
+if __name__ == '__main__':
+    ic()
+    df = main()
+    df.to_csv('/Users/oliviermeyer/Desktop/best_restaurants_siret_test.csv', index=False, header=True, escapechar='\\')
+    ic()