Upload files to "/"
This commit is contained in:
parent 2fcd47baa6
commit 915f0a7d12
91 gault_et_millau_scraping_v2.py Normal file
@@ -0,0 +1,91 @@
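# Scrape restaurant names, addresses and menu prices from the Gault & Millau
# Île-de-France listing and write the result to a CSV file.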
from bs4 import BeautifulSoup
import requests
import datetime
import pandas as pd
from icecream import ic
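

# Collect the per-page URLs advertised by the listing's pagination <option> elements.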
def get_pages(soup):
    pages = list(set([str(_).split('>')[-3][:-1].split('"')[-1] for _ in soup.find_all('option') if 'Page' in str(_)]))
    return pages
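

# Restaurant names appear as <h4> headings; the last two <h4> elements on the page are skipped.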
def get_names(soup):
    return [_.text.strip() for _ in soup.find_all('h4')][:-2]
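

# Extract the href of every anchor that points to a restaurant detail page.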
def get_links(soup):
    return [str(_).split('href')[-1][2:].split('>')[0][:-1] for _ in soup.find_all('a') if 'restaurants/' in str(_)]
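

# Fetch a restaurant's detail page and read its menu table: the <td> cells alternate
# between item label and price (day/time cells are skipped, decimal points become commas).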
def get_prices(link):
    base_url = 'https://fr.gaultmillau.com'
    times = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche', 'Matin', 'Midi', 'Soir']
    new_page = requests.get(base_url + link)
    new_soup = BeautifulSoup(new_page.text, features='html.parser')
    menu = [_.text.strip() for _ in new_soup.find_all('td') if _.text.strip() not in times]
    prices = {}
    for i in range(0, len(menu), 2):
        prices[menu[i]] = menu[i + 1].replace('.', ',')
    return prices
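

# Pull the street address out of the first <b> block that is not a day or time label;
# returns 'NA' when no such block is found.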
def get_address(soup):
    times = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche', 'Matin', 'Midi', 'Soir']
    try:
        address = [_.text.strip().split(' ')[0] + _.text.strip().split(' ')[-1] for _ in soup.find_all('b')
                   if _.text not in times][0]
        return address.split('\n')[0] + ' ' + address.split('\n')[-1]
    except IndexError:
        return 'NA'
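

# Turn one listing page into a DataFrame with one row per (restaurant, menu item) pair,
# carrying the restaurant name, scrape date and cleaned address on every row.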
def scrap(soup):
    names = get_names(soup)
    links = get_links(soup)

    df = pd.DataFrame(columns=['Name', 'Siret', 'Date', 'Address', 'Item', 'Price'])
    for name, link in zip(names, links):
        temporary_df = pd.DataFrame(columns=['Name', 'Siret', 'Date', 'Address', 'Item', 'Price'])
        ic()
        ic(name)
        page = requests.get('https://fr.gaultmillau.com' + link)
        ic(page)
        soup = BeautifulSoup(page.text, features='html.parser')
        prices = get_prices(link)
        temporary_df['Item'] = list(prices.keys())
        temporary_df['Price'] = list(prices.values())
        address = (get_address(soup).replace('bis', '')
                   .replace('bd', 'boulevard').replace(' av ', ' avenue '))
        ic(address)

        for i in range(len(temporary_df)):
            temporary_df.loc[i, 'Name'] = name
            temporary_df.loc[i, 'Date'] = datetime.date.today()
            temporary_df.loc[i, 'Address'] = address

        df = pd.concat([df, temporary_df], ignore_index=True)
    return df
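

# Scrape the first Île-de-France results page, then every extra page reported by
# get_pages, and concatenate everything into a single DataFrame.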
def main():
    url = 'https://fr.gaultmillau.com/fr/region/idf/restaurant#search'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, features='html.parser')
    ic(soup)
    pages = get_pages(soup)
    number_of_pages = len(pages)
    df = scrap(soup)
    for i in range(1, number_of_pages):
        ic(i)
        new_url = 'https://fr.gaultmillau.com' + pages[i]
        new_page = requests.get(new_url)
        new_soup = BeautifulSoup(new_page.text, features='html.parser')
        temporary_df = scrap(new_soup)
        df = pd.concat([df, temporary_df], ignore_index=True)
    return df
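

# Entry point: run the scraper and dump the result to a local CSV file.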
if __name__ == '__main__':
    ic()
    df = main()
    ic()
    df.to_csv('/Users/oliviermeyer/Desktop/gault_et_milau_test.csv', index=False, header=True, escapechar='\\')