Actualiser petit_fute_scraping.py
This commit is contained in:
parent
b0a9202d8c
commit
07e68fed34
|
@ -7,7 +7,8 @@ from icecream import ic
|
|||
|
||||
|
||||
def get_names(soup):
    """Return the restaurant names found on a listing page.

    Collects the stripped text of every ``<h2>`` element in *soup*, skipping
    headings whose text contains ``'manger à'``, and drops the last remaining
    heading.

    NOTE(review): the last heading is presumably a non-restaurant title at the
    bottom of the page -- confirm against the live markup.

    :param soup: parsed page exposing ``find_all`` (a BeautifulSoup object).
    :return: list of names; empty when the page has at most one such heading.
    """
    # The earlier '<h3>'-based return has been superseded by the '<h2>'
    # selector; keeping both left the second one unreachable.
    headings = (tag.text for tag in soup.find_all('h2'))
    names = [text.strip() for text in headings if 'manger à' not in text]
    return names[:-1]
|
||||
|
||||
|
||||
def get_number_of_pages(soup):
|
||||
|
@ -42,13 +43,16 @@ def get_addresses(soup):
|
|||
urls = get_url(soup)
|
||||
addresses = []
|
||||
for url in urls:
|
||||
ic()
|
||||
# ic()
|
||||
page = requests.get(url)
|
||||
soup2 = BeautifulSoup(page.text, features='html.parser')
|
||||
|
||||
all_span = [_.text for _ in soup2.find_all('span')]
|
||||
try:
|
||||
address = (('' + all_span[all_span.index('Paris,')-2] + all_span[all_span.index('Paris,')-1]).
|
||||
replace(',', ' ').replace(' ', ' '))
|
||||
except ValueError:
|
||||
address = ''
|
||||
|
||||
# Deal with word before the street number
|
||||
if not address.split(' ')[0].isdigit():
|
||||
|
@ -68,30 +72,45 @@ def get_addresses(soup):
|
|||
# Deal with abbreviation
|
||||
address = address.replace('Av.', 'avenue').replace('bis ', '')
|
||||
|
||||
ic(address)
|
||||
# ic(address)
|
||||
addresses.append(address)
|
||||
return addresses
|
||||
|
||||
|
||||
def scrap(soup):
    """Scrape one listing page into a DataFrame and archive its HTML.

    Builds a frame with the columns ``Name``, ``Date``, ``Address`` and
    ``Price``. ``Date`` is today's date. ``Address`` and ``Price`` are left
    empty when the scraped list does not have the same length as ``Name``
    (pandas raises ``ValueError`` on the assignment); the mismatching lists
    are logged with ``ic`` in that case.

    The prettified HTML of *soup* is then written once per name into a
    directory named after today's date.

    :param soup: parsed listing page (a BeautifulSoup object).
    :return: ``pandas.DataFrame`` with one row per name.
    """
    ic()
    today = str(datetime.date.today())

    # Scrape each column exactly once. get_addresses() issues one HTTP request
    # per restaurant, so it must not be called a second time just for logging.
    names = get_names(soup)

    df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price',])
    df['Name'] = names
    df['Date'] = datetime.date.today()

    addresses = get_addresses(soup)
    try:
        df['Address'] = addresses
    except ValueError:
        # Length mismatch between names and addresses: keep the column empty.
        ic(names)
        ic(addresses)

    prices = get_prices(soup)
    try:
        df['Price'] = prices
    except ValueError:
        # Length mismatch between names and prices: keep the column empty.
        ic(names)
        ic(prices)

    if names:
        # NOTE(review): absolute, machine-specific archive location.
        newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + today
        os.makedirs(newpath, exist_ok=True)

        # The page is identical for every name; render it once.
        html = soup.prettify()

        for name in df['Name']:
            try:
                # '/' would be read as a directory separator in the file name.
                html_path = os.path.join(newpath, name.replace('/', '-') + '.html')
            except AttributeError:
                # Name is not a string (no .replace); nothing to archive.
                ic()
                continue

            with open(html_path, 'wt', encoding='utf-8') as html_file:
                html_file.write(html)

    return df
|
||||
|
||||
|
||||
|
@ -104,15 +123,22 @@ def complete_scraping():
|
|||
|
||||
number_of_pages = get_number_of_pages(soup)
|
||||
for i in range(number_of_pages+1):
|
||||
try:
|
||||
# try:
|
||||
new_url = url + '?page=' + str(i)
|
||||
new_page = requests.get(new_url)
|
||||
new_soup = BeautifulSoup(new_page.text, features='html.parser')
|
||||
temporary_df = scrap(new_soup)
|
||||
df = pd.concat([df, temporary_df], ignore_index=True)
|
||||
except ValueError:
|
||||
pass
|
||||
# print(df.to_string())
|
||||
# except ValueError:
|
||||
# pass
|
||||
|
||||
df = df.dropna(subset='Address')
|
||||
return df
|
||||
|
||||
prices = df['Price']
|
||||
for i in range(len(prices)):
|
||||
if not pd.isna(prices[i]):
|
||||
if prices[i].replace('€', '') != '':
|
||||
prices[i] = prices[i].split(' ')[0]
|
||||
|
||||
return df
|
||||
|
|
Loading…
Reference in New Issue
Block a user