Copyright notice: this may be the blogger's original article; feel free to repost it as long as the source is credited. https://blog.csdn.net/Jailman/article/details/77673417
A FreeBuf crawler. The script below walks every FreeBuf category in subject_dict, pages through each listing until it runs into 404s, and appends the extracted article markup to one HTML file per category.
#!C:\Python27\python.exe
#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 hack so mixed str/unicode writes default to UTF-8
import re
import os
import urllib
import requests
from multiprocessing import Pool
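# Pool is only needed for the optional parallel variant sketched after the script.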
# Category name (used as the output filename) -> listing-page URL
subject_dict = {
    u'漏洞': 'http://www.freebuf.com/vuls',
    u'安全工具': 'http://www.freebuf.com/sectool',
    u'WEB安全': 'http://www.freebuf.com/articles/web',
    u'系统安全': 'http://www.freebuf.com/articles/system',
    u'网络安全': 'http://www.freebuf.com/articles/network',
    u'无线安全': 'http://www.freebuf.com/articles/wireless',
    u'终端安全': 'http://www.freebuf.com/articles/terminal',
    u'数据安全': 'http://www.freebuf.com/articles/database',
    u'安全管理': 'http://www.freebuf.com/articles/security-management',
    u'企业安全': 'http://www.freebuf.com/articles/es',
    u'极客': 'http://www.freebuf.com/geek',
}
def spider(filename, url):
    print "Crawling subject: %s" % filename
    # Start from a clean output file for this category.
    if os.path.isfile(filename + ".html"):
        os.remove(filename + ".html")
    with open(filename + ".html", 'a') as f:
        page = 0
        error_counter = 0
        while True:
            page += 1
            try:
                html = requests.get(url + '/page/' + str(page))
                code = html.status_code
                if code == 404:
                    # A 404 usually means we have run past the last listing page.
                    error_counter += 1
                    if error_counter == 1:
                        print "Subject %s may only have %s pages." % (filename, str(page - 1))
                    if error_counter <= 3:
                        # Retry the same page up to three times before giving up.
                        print "Retrying %s: 404 Not Found!" % str(error_counter)
                        page -= 1
                        continue
                    else:
                        print "Subject %s finished!" % filename
                        print "#################################"
                        break
                else:
                    error_counter = 0  # Reset the counter after a successful fetch.
                    print u"Parsing page: " + str(page)
                    # Cut the article timeline out of the page; page 1 wraps it
                    # differently from the later pages. ("news-detial" is the
                    # class name as it appears in the page source.)
                    if page == 1:
                        site = re.findall('([\s\S]*) </div>\n <div class="news-more" id="pagination">', html.text, re.S)
                    else:
                        site = re.findall('<div id="timeline" class="news-detial">([\s\S]*?) </div>\n <div class="news-more" id="pagination">', html.text, re.S)
                    for each in site:
                        f.write(urllib.unquote(each.encode('utf-8')))
            except Exception as e:
                print e
    # No explicit close needed: the with-statement closes the file.
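# Usage example: crawling a single category by hand, e.g.
#   spider(u'漏洞', 'http://www.freebuf.com/vuls')
# writes the extracted listing markup to 漏洞.html in the working directory.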
def main():
    # Sequential crawl: one category at a time.
    for key, value in subject_dict.items():
        spider(key, value)
    # A parallel version of this loop is sketched after the script. The
    # original commented-out attempt called .get() right after apply_async,
    # which blocks on each task and would have serialized the crawl.
if __name__ == '__main__':
main()
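Here is a minimal sketch of the parallel variant referenced in main(), assuming the spider function and subject_dict defined above. The key point is to submit every task before collecting any result: calling .get() immediately after apply_async blocks on each task and serializes the crawl.

def parallel_main():
    # Fan the categories out over four worker processes; spider is a
    # top-level function, so it pickles cleanly for the workers.
    pool = Pool(processes=4)
    results = [pool.apply_async(spider, (name, link))
               for name, link in subject_dict.items()]
    pool.close()
    pool.join()
    for r in results:
        r.get()  # Re-raises any exception that occurred in a worker.

To use it, call parallel_main() instead of main() from the entry point. Keeping the if __name__ == '__main__' guard is mandatory here, since multiprocessing on Windows (which the interpreter path at the top suggests) re-imports the module in each worker process.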