本期,我们实现对豆瓣图书TOP250榜单的爬取,爬取的网址为:https://book.douban.com/top250
我们爬取图书名称、作者、出版社、评分、评论人数等信息并保存到Excel文件中,具体代码为:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re


def get_html(url):
    """Fetch one Douban Top-250 list page and return the raw HTML bytes.

    A browser User-Agent is required: Douban rejects the default
    python-requests UA. A timeout prevents the script from hanging
    forever on a stalled connection.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) '
                      'Gecko/20100101 Firefox/47.0'}
    return requests.get(url, headers=header, timeout=10).content


def get_con(html):
    """Parse a list page and return one row per book.

    Each row is: [title, author, publisher, publish date, price,
    rating, comment count]. The per-book detail line looks like
    "author / [translator /] publisher / date / price", so the last
    three '/'-separated fields are fixed and everything before them
    is the author (possibly plus translator).
    """
    soup = BeautifulSoup(html, 'html.parser')
    book_list = soup.find('div', attrs={'class': 'article'})
    data = []
    for table in book_list.find_all('table'):
        book_name = table.find('div', attrs={'class': 'pl2'})
        book_details = table.find('p').get_text()
        # Split once and reuse (original split the same string 4 times).
        parts = book_details.split('/')
        # Join the author fields into a string; leaving the raw list
        # slice would write "['author ', ' translator ']" into Excel.
        author = '/'.join(p.strip() for p in parts[:-3])
        publisher = parts[-3].strip()
        pb_date = parts[-2].strip()
        price = parts[-1].strip()
        rate = table.find('span', attrs={'class': 'rating_nums'}).get_text()
        # Comment count is the first run of digits, e.g. "(123456人评价)".
        rate_comments = re.findall(
            r'\d+', table.find('span', attrs={'class': 'pl'}).get_text()
        )[0] + '人评论'
        # Titles may be split across several text nodes (main title +
        # subtitle); join them all instead of only the first two.
        title = ''.join(book_name.find('a').stripped_strings)
        data.append([title, author, publisher, pb_date, price,
                     rate, rate_comments])
    return data


if __name__ == '__main__':
    # Collect all rows first, then build the DataFrame once:
    # DataFrame.append was removed in pandas 2.0, and repeated
    # concatenation was quadratic anyway.
    rows = []
    for page in range(10):  # 10 pages x 25 books = Top 250
        url = f"https://book.douban.com/top250?start={page * 25}"
        print(url)
        rows.extend(get_con(get_html(url)))
    all_list = pd.DataFrame(
        rows,
        columns=['名称', '作者', '出版社', '出版日期', '价格', '豆瓣评分', '评论人数'])
    all_list.to_excel('豆瓣图书TOP250.xlsx', index=False)
爬取效果:
OK,本期的爬虫就到这里,Bye!