看爬虫教程的简单练习
从起始id到15000逐条爬取知轩藏书的书籍信息(目前网站上书的id尚未超过15000),存为csv文件,可以用excel打开或者导入数据库。可以设置interval,让一次爬取的数目小一点。
代码:Spider-zxcs.py
from urllib import request, parse
import time
import random
from ua_info import ua_list
import re
import csv
import os
class ZxcsSpider:
    """Crawl book records one id at a time and store them in a CSV file.

    For every id in the requested range the spider downloads the book page,
    extracts three fields from it with a regular expression, downloads the
    review data for the same id and keeps one row per successfully parsed
    book. All rows are written to ``<start id>.csv`` when the crawl ends.
    """

    def __init__(self, url='', url_review='', interval=15000,
                 pattern=None, header=None):
        """Store URL templates, crawl size and parsing configuration.

        Args:
            url: ``str.format`` template of a book page; receives the id.
            url_review: ``str.format`` template of the review endpoint;
                receives the id and a random float.
            interval: Number of consecutive ids visited by one ``run()``.
            pattern: Regular expression (string or compiled pattern) with at
                least three groups, searched for in the book page.
            header: Column names written as the first CSV row.
        """
        # NOTE(review): both URL templates are empty strings, and the regular
        # expression and the CSV header are not defined anywhere in the
        # visible source. They are constructor parameters so that they can be
        # supplied without editing the class -- TODO confirm the real values.
        self.url = url
        self.url_review = url_review
        self.interval = interval
        if isinstance(pattern, str):
            pattern = re.compile(pattern)
        self.pattern = pattern
        if header is None:
            # Placeholder names: one per value that run() puts into a row.
            header = ['bookid', 'field1', 'field2', 'field3',
                      'review0', 'review4']
        self.header = list(header)

    def get_html(self, url):
        """Download ``url`` with a random User-Agent and return it as text.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        req = request.Request(
            url=url, headers={'User-Agent': random.choice(ua_list)})
        # The context manager closes the connection even if read() or
        # decode() raises.
        with request.urlopen(req, timeout=5) as res:
            return res.read().decode("utf-8")

    def parse_html(self, bookid, html):
        """Extract the book fields from a page with the configured regex.

        Returns:
            ``[bookid, group 1, group 2, group 3]`` of the first match.

        Raises:
            ValueError: If no pattern is configured or the page does not
                match it.
        """
        if self.pattern is None:
            raise ValueError('no parsing pattern configured')
        found = self.pattern.search(html)
        if found is None:
            raise ValueError(
                'page of book {} does not match the pattern'.format(bookid))
        return [bookid, found.group(1), found.group(2), found.group(3)]

    def get_review(self, bookid):
        """Download the raw review response for ``bookid``."""
        # The random float fills the second placeholder of the template;
        # presumably a cache-busting parameter -- TODO confirm.
        url = self.url_review.format(bookid, random.random())
        return self.get_html(url)

    def write2csv(self, filename, header, data):
        """Write ``header`` followed by all rows of ``data`` to ``filename``.

        The file is overwritten. It is encoded as UTF-8 with a BOM
        (``utf_8_sig``).
        """
        with open(filename, 'w', encoding='utf_8_sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            writer.writerows(data)

    def run(self):
        """Ask for a start id, crawl ``interval`` ids and save the rows."""
        bookid = input('input bookid:')
        startid = int(bookid)
        data = []
        start = time.time()
        for i in range(startid, startid + self.interval):
            url = self.url.format(i)
            try:
                html = self.get_html(url)
                r_list = self.parse_html(i, html)
                review = self.get_review(i).split(',')
                # Only the first and the fifth comma-separated values of the
                # review response are kept.
                r_list.append(review[0])
                r_list.append(review[4])
            except Exception as exc:
                # One bad id must not abort the crawl. Catching Exception
                # rather than using a bare except keeps Ctrl-C working.
                print(url + ' 访问失败', exc)
            else:
                data.append(r_list)
        filename = bookid + '.csv'
        try:
            self.write2csv(filename, self.header, data)
        except Exception as e:
            print('存储失败', e)
        else:
            print('存储成功')
        end = time.time()
        print('用时{}'.format(end - start))
if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive crawl.
    spider = ZxcsSpider()
    spider.run()
    # No trailing os.system() call: invoked without a command string it
    # raises TypeError.
补充:ua_info.py
# Pool of browser User-Agent header values; the spider picks one at random
# for every request. Each entry is the bare header value: no "User-Agent:"
# prefix and no surrounding whitespace.
ua_list = [
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]