diff --git a/petit_fute_scraping.py b/petit_fute_scraping.py
index 2b771a6..9907280 100644
--- a/petit_fute_scraping.py
+++ b/petit_fute_scraping.py
@@ -7,7 +7,8 @@ from icecream import ic
 
 
 def get_names(soup):
-    return [_.text.strip() for _ in soup.find_all('h3')]
+    # ic(soup)
+    return [_.text.strip() for _ in soup.find_all('h2') if 'manger à' not in _.text][:-1]
 
 
 def get_number_of_pages(soup):
@@ -42,13 +43,16 @@ def get_addresses(soup):
     urls = get_url(soup)
     addresses = []
     for url in urls:
-        ic()
+        # ic()
         page = requests.get(url)
         soup2 = BeautifulSoup(page.text, features='html.parser')
         all_span = [_.text for _ in soup2.find_all('span')]
-        address = (('' + all_span[all_span.index('Paris,')-2] + all_span[all_span.index('Paris,')-1]).
-                   replace(',', ' ').replace('  ', ' '))
+        try:
+            address = (('' + all_span[all_span.index('Paris,')-2] + all_span[all_span.index('Paris,')-1]).
+                       replace(',', ' ').replace('  ', ' '))
+        except ValueError:
+            address = ''
 
         # Deal with word before the street number
         if not address.split(' ')[0].isdigit():
@@ -68,30 +72,45 @@
         # Deal with abbreviation
         address = address.replace('Av.', 'avenue').replace('bis ', '')
-        ic(address)
+        # ic(address)
 
         addresses.append(address)
 
     return addresses
 
 
 def scrap(soup):
+    ic()
     df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price',])
     df['Name'] = get_names(soup)
+    # ic(len(get_names(soup)))
     df['Date'] = datetime.date.today()
-    df['Address'] = get_addresses(soup)
-    df['Price'] = get_prices(soup)
+    try:
+        df['Address'] = get_addresses(soup)
+        # ic(len(get_addresses(soup)))
+    except ValueError:
+        ic(get_names(soup))
+        ic(get_addresses(soup))
+    try:
+        df['Price'] = get_prices(soup)
+        # ic(len(get_prices(soup)))
+    except ValueError:
+        ic(get_names(soup))
+        ic(get_prices(soup))
 
     for name in df['Name']:
         newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + str(datetime.date.today())
         if not os.path.exists(newpath):
             os.makedirs(newpath)
 
-        html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
-                     str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
-
-        with open(html_path, 'wt', encoding='utf-8') as html_file:
-            for line in soup.prettify():
-                html_file.write(line)
+        try:
+            html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' +
+                         str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
+            with open(html_path, 'wt', encoding='utf-8') as html_file:
+                for line in soup.prettify():
+                    html_file.write(line)
+        except AttributeError:
+            ic()
 
+    # print(df)
     return df
 
 
@@ -104,15 +123,22 @@ def complete_scraping():
     number_of_pages = get_number_of_pages(soup)
 
     for i in range(number_of_pages+1):
-        try:
-            new_url = url + '?page=' + str(i)
-            new_page = requests.get(new_url)
-            new_soup = BeautifulSoup(new_page.text, features='html.parser')
-            temporary_df = scrap(new_soup)
-            df = pd.concat([df, temporary_df], ignore_index=True)
-        except ValueError:
-            pass
+        # try:
+        new_url = url + '?page=' + str(i)
+        new_page = requests.get(new_url)
+        new_soup = BeautifulSoup(new_page.text, features='html.parser')
+        temporary_df = scrap(new_soup)
+        df = pd.concat([df, temporary_df], ignore_index=True)
+        # print(df.to_string())
+        # except ValueError:
+        #     pass
 
     df = df.dropna(subset='Address')
 
-    return df
+    prices = df['Price']
+    for i in range(len(prices)):
+        if not pd.isna(prices[i]):
+            if prices[i].replace('€', '') != '':
+                prices[i] = prices[i].split(' ')[0]
+
+    return df
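
For reference, the address-extraction step that the new `try`/`except ValueError` guards can be read as a small standalone helper. This is a minimal sketch, not part of the patch: it assumes the page lists the street number and street name in the two `<span>` elements immediately before the span whose text is exactly `'Paris,'`, and the helper name `extract_address` is hypothetical.

```python
# Hypothetical helper mirroring the logic inside get_addresses().
# Assumption: the street number and street name sit in the two spans
# immediately before the span containing exactly 'Paris,'.
def extract_address(span_texts):
    try:
        anchor = span_texts.index('Paris,')
    except ValueError:
        # 'Paris,' never appears on the page, so there is no address to read
        return ''
    raw = span_texts[anchor - 2] + span_texts[anchor - 1]
    # Normalise separators the same way the patched code does
    return raw.replace(',', ' ').replace('  ', ' ').strip()
```

Under those assumptions, `extract_address(['12 ', 'rue de Rivoli ', 'Paris,'])` returns `'12 rue de Rivoli'`, and a page with no `'Paris,'` span yields `''`, matching the patch's fallback.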
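
One caveat on the new price-normalisation loop at the end of `complete_scraping()`: after `dropna()` the DataFrame keeps its original index labels, so positional-style access like `prices[i]` inside `range(len(prices))` can raise `KeyError` on a non-contiguous integer index, and writing through the `prices` view triggers pandas' chained-assignment warning. A hedged alternative sketch doing the same "keep the first space-separated token" rewrite with a boolean mask and `.loc` (the function name `normalise_prices` is an assumption, not the author's code):

```python
import pandas as pd

def normalise_prices(df):
    # Hypothetical rewrite of the patch's price loop, same semantics:
    # rows whose price is present and not just a bare euro sign...
    mask = df['Price'].notna() & (df['Price'].str.replace('€', '', regex=False) != '')
    # ...keep only the first space-separated token, e.g. '25 € - 40 €' -> '25'
    df.loc[mask, 'Price'] = df.loc[mask, 'Price'].str.split(' ').str[0]
    return df
```

Alternatively, calling `df = df.reset_index(drop=True)` right after the `dropna()` would make the patch's original positional loop safe as written.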