Upload files to "/"
This commit is contained in:
parent 7b7bf32f29
commit 2fcd47baa6
resto_de_paris_scraping.py | 159 lines (new, normal file)
@@ -0,0 +1,159 @@
from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic
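# NOTE: icecream's ic() is a third-party debugging helper (`pip install icecream`);
# called with no arguments it prints the file, line number and time of the call.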


URLS = ['https://restodeparis.com/coup-de-coeur/',
        'https://restodeparis.com/selection-michelin/',
        'https://restodeparis.com/restaurant/gastronomique/',
        'https://restodeparis.com/restaurant/bistronomique/',
        'https://restodeparis.com/restaurant/francais/',
        'https://restodeparis.com/restaurant/cuisine-du-monde/',
        'https://restodeparis.com/coup-de-coeur/']
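# 'coup-de-coeur' appears twice in URLS; the set() in get_all_names() removes
# the duplicate names it produces, so the second request is merely redundant.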


def get_names(soup):
    # Keep only the anchors whose HTML carries the listing marker.
    anchors = [a for a in soup.find_all('a')
               if str(a).find('target="_self" title=') != -1]

    # The last 11 matching anchors hold the page's restaurant cards; stepping
    # by 2 keeps one link per card (the layout apparently repeats each link).
    last_anchors = [str(a) for a in anchors[-11::2]]

    # Strip the trailing '</a>' and keep the text after the final '>'.
    return [name[:-4].split('>')[-1] for name in last_anchors]


def get_all_names(urls):
    names = []
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        names += get_names(soup)

    # Turn display names into URL slugs: hyphenate spaces and map the accented
    # characters seen on the site to plain ASCII.
    names = [name.replace(' ', '-').replace('’', '').replace('\'', '')
             .replace('à', 'a').replace('â', 'a').replace('ắ', 'a')
             .replace('é', 'e').replace('è', 'e')
             .replace('î', 'i').replace('ï', 'i')
             .replace('ö', 'o').replace('œ', 'oe').replace('ü', 'u')
             .replace('ç', 'c')
             .replace('---', '-')
             .lower()
             for name in names]

    # Deduplicate (a restaurant can appear in several categories) and sort.
    return sorted(set(names))
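# A broader normalisation (a sketch, not what the script uses) could rely on
# unicodedata instead of an explicit replace() chain; note that 'œ' has no
# NFKD decomposition and still needs its own replace:
#
#   import unicodedata
#   def slugify(name):
#       name = name.replace('œ', 'oe').replace(' ', '-').lower()
#       return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')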


def get_restaurants_url(names):
    restaurants_url = []
    for name in names:
        # Known exception: 'cafe-sud' is mapped to 'la-regalade-saint-honore'
        # instead of its own slug.
        if name == 'cafe-sud':
            url = 'https://restodeparis.com/restaurant/la-regalade-saint-honore/'
        else:
            url = 'https://restodeparis.com/restaurant/' + name + '/'
        restaurants_url.append(url)
    return restaurants_url
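# Illustrative call (the slug is hypothetical):
#   get_restaurants_url(['le-procope'])
#   -> ['https://restodeparis.com/restaurant/le-procope/']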


def get_address(soup):
    # The address sits in the <span> that follows the 'Y Aller !' button;
    # the [1:-4] slice trims the surrounding whitespace characters.
    spans = [s.text for s in soup.find_all('span')]
    address = ''
    for i in range(len(spans)):
        if spans[i].find('Y Aller !') != -1:
            address = spans[i + 1][1:-4]
    return address.replace(',', '')
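# A defensive variant (an assumption, not in the original) would guard the
# spans[i + 1] lookup, which raises IndexError if 'Y Aller !' is the last span:
#
#   if 'Y Aller !' in spans[i] and i + 1 < len(spans):
#       address = spans[i + 1][1:-4]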


def get_menu(soup):
    prices = [str(s.text) for s in soup.find_all('span')]
    prices = prices[:-23]  # the trailing spans never belong to the menu

    # Drop empty and newline-only entries (filtering into a new list rather
    # than removing items from the list while iterating over it).
    prices = [p for p in prices if len(p) != 0 and p != '\n\n']

    # When an entry starts with a newline, keep the text up to the first tab.
    prices_v2 = []
    for price in prices:
        if price[0] == '\n':
            prices_v2.append(price[1:price.find('\t')])
        else:
            prices_v2.append(price)

    # Keep only the entries after the last 'La Carte' heading.
    index_la_carte = 0
    for i in range(len(prices_v2)):
        if prices_v2[i] == 'La Carte':
            index_la_carte = i + 1
    prices_v2 = prices_v2[index_la_carte:]

    # Cut everything from the trailing 'Le Restaurant' section onwards.
    for i in range(1, len(prices_v2) - 1):
        if prices_v2[-i] == 'Le Restaurant':
            prices_v2 = prices_v2[:-i]

    # Truncate after the last entry that contains a euro price.
    index_last_price = 0
    for i in range(len(prices_v2)):
        ic(prices_v2[i])
        if '€' in prices_v2[i]:
            index_last_price = i + 1
    if index_last_price != 0:
        prices_v2 = prices_v2[:index_last_price]

    # French decimal notation: '12.50' -> '12,50'.
    prices_v2 = [price.replace('.', ',') for price in prices_v2]

    # Pair the remaining entries as {item: price}; stopping at len - 1 guards
    # against an unpaired trailing item.
    prices_v3 = {}
    for i in range(0, len(prices_v2) - 1, 2):
        prices_v3[prices_v2[i]] = prices_v2[i + 1]

    return prices_v3
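# Shape of the return value, with illustrative (not scraped) entries:
#   {'Entrée + Plat': '32,00 €', 'Plat + Dessert': '34,00 €'}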


def main():
    names = get_all_names(URLS)
    restaurants_url = get_restaurants_url(names)
    # 'Address' must match temporary_df below; a spelling mismatch would make
    # pd.concat create two separate address columns.
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])

    for name, url in zip(names, restaurants_url):
        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
        ic(name)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        menu = get_menu(soup)  # parse the menu once, reuse keys and values
        temporary_df['Item'] = list(menu.keys())
        temporary_df['Price'] = list(menu.values())
        address = get_address(soup)
        ic(address)

        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Date'] = datetime.date.today()
            temporary_df.loc[i, 'Address'] = address

        df = pd.concat([df, temporary_df], ignore_index=True)

        # Archive the raw HTML in a folder named after today's date.
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Resto_de_Paris/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)

        html_path = os.path.join(newpath, name.replace('/', '-') + '.html')

        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(soup.prettify())

    return df
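# A more resilient fetch (an assumption, not part of the original) would pass
# the timeout and headers parameters that requests.get() supports:
#
#   page = requests.get(url, timeout=10,
#                       headers={'User-Agent': 'resto-de-paris-scraper'})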


if __name__ == '__main__':
    ic()  # timestamp: start of the run
    df = main()
    df.to_csv('/Users/oliviermeyer/Desktop/resto_de_paris_test.csv',
              index=False, header=True, escapechar='\\')
    ic()  # timestamp: end of the run