Update petit_fute_scraping.py

Olivier MEYER 2024-07-15 14:43:18 +02:00
parent b0a9202d8c
commit 07e68fed34


@@ -7,7 +7,8 @@ from icecream import ic
 
 
 def get_names(soup):
-    return [_.text.strip() for _ in soup.find_all('h3')]
+    # ic(soup)
+    return [_.text.strip() for _ in soup.find_all('h2') if 'manger à' not in _.text][:-1]
 
 
 def get_number_of_pages(soup):
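
The reworked get_names keeps the restaurant h2 headings, filters out the page heading that contains 'manger à', and drops the trailing non-restaurant heading with [:-1]. A minimal sketch of that selection pattern on an invented HTML fragment (the headings below are placeholders, not real listing content):

    from bs4 import BeautifulSoup

    # Invented fragment mimicking a listing page: one page-level heading, two
    # restaurant headings, and a trailing heading that is not a restaurant.
    html = """
    <h2>Où manger à Paris</h2>
    <h2> Chez Hypothetique </h2>
    <h2>Le Restaurant Fictif</h2>
    <h2>Voir aussi</h2>
    """
    soup = BeautifulSoup(html, features='html.parser')

    names = [_.text.strip() for _ in soup.find_all('h2') if 'manger à' not in _.text][:-1]
    print(names)  # ['Chez Hypothetique', 'Le Restaurant Fictif']
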
@@ -42,13 +43,16 @@ def get_addresses(soup):
     urls = get_url(soup)
     addresses = []
     for url in urls:
-        ic()
+        # ic()
         page = requests.get(url)
         soup2 = BeautifulSoup(page.text, features='html.parser')
         all_span = [_.text for _ in soup2.find_all('span')]
 
-        address = (('' + all_span[all_span.index('Paris,')-2] + all_span[all_span.index('Paris,')-1]).
-                   replace(',', ' ').replace(' ', ' '))
+        try:
+            address = (('' + all_span[all_span.index('Paris,')-2] + all_span[all_span.index('Paris,')-1]).
+                       replace(',', ' ').replace(' ', ' '))
+        except ValueError:
+            address = ''
 
         # Deal with word before the street number
         if not address.split(' ')[0].isdigit():
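
The new try/except works because list.index raises ValueError when the value is absent, so a detail page whose span texts never contain the literal 'Paris,' now falls back to an empty address instead of aborting the whole loop. A small self-contained sketch of the same guard (the helper name and span lists are invented for illustration):

    def extract_address(all_span):
        # Mirrors the patched logic: the two spans just before the 'Paris,' marker
        # hold the street number and street name; a missing marker falls back to ''.
        try:
            address = (all_span[all_span.index('Paris,') - 2]
                       + all_span[all_span.index('Paris,') - 1]).replace(',', ' ')
        except ValueError:               # 'Paris,' not present in the list
            address = ''
        return address

    print(extract_address(['12', ' rue Fictive', 'Paris,', '75011']))  # '12 rue Fictive'
    print(extract_address(['Horaires', 'Menu']))                       # ''
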
@@ -68,30 +72,45 @@ def get_addresses(soup):
         # Deal with abbreviation
         address = address.replace('Av.', 'avenue').replace('bis ', '')
-        ic(address)
+        # ic(address)
         addresses.append(address)
     return addresses
 
 
 def scrap(soup):
+    ic()
     df = pd.DataFrame(columns=['Name', 'Date', 'Address', 'Price',])
     df['Name'] = get_names(soup)
+    # ic(len(get_names(soup)))
     df['Date'] = datetime.date.today()
-    df['Address'] = get_addresses(soup)
-    df['Price'] = get_prices(soup)
+    try:
+        df['Address'] = get_addresses(soup)
+        # ic(len(get_addresses(soup)))
+    except ValueError:
+        ic(get_names(soup))
+        ic(get_addresses(soup))
+    try:
+        df['Price'] = get_prices(soup)
+        # ic(len(get_prices(soup)))
+    except ValueError:
+        ic(get_names(soup))
+        ic(get_prices(soup))
 
     for name in df['Name']:
         newpath = '/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' + str(datetime.date.today())
         if not os.path.exists(newpath):
             os.makedirs(newpath)
-        html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Le Fooding/' +
-                     str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
-        with open(html_path, 'wt', encoding='utf-8') as html_file:
-            for line in soup.prettify():
-                html_file.write(line)
+        try:
+            html_path = ('/Users/oliviermeyer/Desktop/CREST/Web Scraping/HTML/Petit Fute/' +
+                         str(datetime.date.today()) + '/' + name.replace('/', '-') + '.html')
+            with open(html_path, 'wt', encoding='utf-8') as html_file:
+                for line in soup.prettify():
+                    html_file.write(line)
+        except AttributeError:
+            ic()
+    # print(df)
     return df
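
The try/except ValueError blocks around the column assignments rely on pandas refusing to assign a list whose length differs from the frame's index, which happens whenever get_names and get_addresses (or get_prices) return different counts for the same page; the ic() calls in the except branches then dump both sides for inspection. A minimal sketch of that failure mode with made-up values:

    import pandas as pd
    from icecream import ic

    # Two names but only one address: the kind of mismatch the except branch catches.
    df = pd.DataFrame(columns=['Name', 'Address'])
    df['Name'] = ['Chez Hypothetique', 'Le Restaurant Fictif']   # sets a 2-row index

    addresses = ['12 rue Fictive']
    try:
        df['Address'] = addresses        # length 1 vs index length 2 -> ValueError
    except ValueError:
        ic(list(df['Name']))             # inspect both sides of the mismatch
        ic(addresses)
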
@@ -104,15 +123,22 @@ def complete_scraping():
     number_of_pages = get_number_of_pages(soup)
 
     for i in range(number_of_pages+1):
-        try:
-            new_url = url + '?page=' + str(i)
-            new_page = requests.get(new_url)
-            new_soup = BeautifulSoup(new_page.text, features='html.parser')
-            temporary_df = scrap(new_soup)
-            df = pd.concat([df, temporary_df], ignore_index=True)
-        except ValueError:
-            pass
+        # try:
+        new_url = url + '?page=' + str(i)
+        new_page = requests.get(new_url)
+        new_soup = BeautifulSoup(new_page.text, features='html.parser')
+        temporary_df = scrap(new_soup)
+        df = pd.concat([df, temporary_df], ignore_index=True)
+        # print(df.to_string())
+        # except ValueError:
+        #     pass
 
     df = df.dropna(subset='Address')
-    return df
+
+    prices = df['Price']
+    for i in range(len(prices)):
+        if not pd.isna(prices[i]):
+            if prices[i].replace('', '') != '':
+                prices[i] = prices[i].split(' ')[0]
+
+    return df
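
The closing loop normalises the Price column by keeping only the first whitespace-separated token of each non-missing entry. The same cleanup can be expressed with pandas string methods; a sketch on invented price strings (the real values come from get_prices):

    import pandas as pd

    # Invented price strings; in the script they come from get_prices(soup).
    prices = pd.Series(['25 € - 40 €', None, '18 €'])

    # Same effect as the per-index loop above: keep the first whitespace-separated
    # token of each entry; missing entries stay missing.
    cleaned = prices.str.split(' ').str[0]
    print(list(cleaned))  # ['25', nan, '18']
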