Upload files to "/"
This commit is contained in:
parent 7b7bf32f29
commit 2fcd47baa6

resto_de_paris_scraping.py    159    Normal file
@@ -0,0 +1,159 @@
from bs4 import BeautifulSoup
import requests
import datetime
import os
import pandas as pd
from icecream import ic

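# Pipeline: fetch each listing page in URLS, extract restaurant-name slugs
# from their anchors, visit every restaurant page, and gather menu items,
# prices and addresses into one DataFrame while archiving the raw HTML.
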
# Listing pages to harvest restaurant names from; a restaurant can appear
# on several pages, which get_all_names() deduplicates.
URLS = ['https://restodeparis.com/coup-de-coeur/',
        'https://restodeparis.com/selection-michelin/',
        'https://restodeparis.com/restaurant/gastronomique/',
        'https://restodeparis.com/restaurant/bistronomique/',
        'https://restodeparis.com/restaurant/francais/',
        'https://restodeparis.com/restaurant/cuisine-du-monde/']


def get_names(soup):
    # Keep only the anchors that link to restaurant pages; on the listing
    # pages these carry a 'target="_self" title=' attribute pair.
    anchors = soup.find_all('a')
    names_v2 = [str(tag) for tag in anchors if 'target="_self" title=' in str(tag)]

    # The restaurant links sit at the end of the page: take every other one
    # of the last 11 matches (indices -11, -9, ..., -1).
    names_v3 = [names_v2[i] for i in range(-11, 0, 2)]

    # Strip the trailing '</a>' and keep the text after the final '>'.
    return [name[:-4].split('>')[-1] for name in names_v3]

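# Illustration of the markup get_names() expects (the restaurant name is
# hypothetical):
#   <a href="..." target="_self" title="Le Bistrot">Le Bistrot</a>
# would yield 'Le Bistrot'.
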
def get_all_names(urls):
    names = []
    for url in urls:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        names += get_names(soup)

    # Turn display names into URL slugs: hyphenate spaces, drop apostrophes
    # and fold accented characters to plain ASCII.
    names = [name.replace(' ', '-').replace('’', '').replace('\'', '')
             .replace('à', 'a')
             .replace('â', 'a')
             .replace('ắ', 'a')
             .replace('é', 'e')
             .replace('è', 'e')
             .replace('î', 'i')
             .replace('ï', 'i')
             .replace('ö', 'o')
             .replace('œ', 'oe')
             .replace('ü', 'u')
             .replace('ç', 'c')
             .replace('---', '-')
             .lower()
             for name in names]

    # Deduplicate and sort for a stable scraping order.
    return sorted(set(names))

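# Example: a (hypothetical) listing name 'Café de la Gare' becomes the
# slug 'cafe-de-la-gare'.
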
def get_restaurants_url(names):
    restaurants_url = []
    for name in names:
        # Hard-coded exception: the 'cafe-sud' slug is mapped to the
        # 'la-regalade-saint-honore' page instead of its own.
        if name == 'cafe-sud':
            url = 'https://restodeparis.com/restaurant/' + 'la-regalade-saint-honore/'
        else:
            url = 'https://restodeparis.com/restaurant/' + name + '/'
        restaurants_url.append(url)
    return restaurants_url


def get_address(soup):
    # The street address sits in the span that follows the one holding the
    # 'Y Aller !' ("Go there!") button; the slice trims stray characters
    # around it.
    spans = [span.text for span in soup.find_all('span')]
    address = ''
    for i in range(len(spans) - 1):
        if 'Y Aller !' in spans[i]:
            address = spans[i + 1][1:-4]
    return address.replace(',', '')

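# Hypothetical illustration: if the span after 'Y Aller !' held
# '\n12 Rue de Paris, 75001\n\t\t\t', get_address() would return
# '12 Rue de Paris 75001' (edges trimmed, comma stripped).
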
def get_menu(soup):
    # Menu entries live in <span> tags; the trailing 23 spans (presumably
    # page boilerplate) are dropped.
    prices = [str(span.text) for span in soup.find_all('span')]
    prices = prices[:-23]

    # Drop empty and blank entries up front (removing from a list while
    # iterating over it skips elements, so filter with a comprehension).
    prices = [price for price in prices if len(price) > 0 and price != '\n\n']

    # Spans that start with a newline carry tab padding: keep only the
    # text between the leading newline and the first tab.
    prices_v2 = []
    for price in prices:
        if price[0] == '\n':
            prices_v2.append(price[1:price.find('\t')])
        else:
            prices_v2.append(price)

    # Keep only what follows the last 'La Carte' heading ...
    index_la_carte = 0
    for i in range(len(prices_v2)):
        if prices_v2[i] == 'La Carte':
            index_la_carte = i + 1
    prices_v2 = prices_v2[index_la_carte:]

    # ... and cut everything from the trailing 'Le Restaurant' section on.
    for i in range(1, len(prices_v2) - 1):
        if prices_v2[-i] == 'Le Restaurant':
            prices_v2 = prices_v2[:-i]

    # Truncate after the last entry that carries a euro price.
    index_last_price = 0
    for i in range(len(prices_v2)):
        ic(prices_v2[i])
        if '€' in prices_v2[i]:
            index_last_price = i + 1
    if index_last_price != 0:
        prices_v2 = prices_v2[:index_last_price]

    # French decimal convention: commas instead of points.
    prices_v2 = [price.replace('.', ',') for price in prices_v2]

    # Entries alternate item name / price; pair them into a dict.
    prices_v3 = {}
    for i in range(0, len(prices_v2), 2):
        prices_v3[prices_v2[i]] = prices_v2[i + 1]

    return prices_v3

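# A successful parse pairs alternating name/price entries, e.g. with
# hypothetical values: {'Menu Déjeuner': '32,50 €', 'Carte Blanche': '75 €'}
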
def main():
    names = get_all_names(URLS)
    restaurants_url = get_restaurants_url(names)
    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])

    for name, url in zip(names, restaurants_url):
        temporary_df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Item', 'Price'])
        ic(name)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, features='html.parser')
        menu = get_menu(soup)  # parse once and reuse
        temporary_df['Item'] = menu.keys()
        temporary_df['Price'] = menu.values()
        address = get_address(soup)
        ic(address)

        # Same name, date and address on every menu row of this restaurant.
        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Date'] = datetime.date.today()
            temporary_df.loc[i, 'Address'] = address

        df = pd.concat([df, temporary_df], ignore_index=True)

        # Archive the raw HTML in a folder named after today's date.
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Resto_de_Paris/' + str(datetime.date.today())
        if not os.path.exists(newpath):
            os.makedirs(newpath)

        html_path = newpath + '/' + name.replace('/', '-') + '.html'

        with open(html_path, 'wt', encoding='utf-8') as html_file:
            html_file.write(soup.prettify())

    return df


if __name__ == '__main__':
    ic()
    df = main()
    df.to_csv('/Users/oliviermeyer/Desktop/resto_de_paris_test.csv', index=False, header=True, escapechar='\\')
    ic()
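# Running this module end-to-end scrapes every listing page, writes one
# dated HTML snapshot per restaurant, and saves the combined menu table to
# resto_de_paris_test.csv (all paths point to the author's local machine).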