#!C:\Python27\python.exe
# coding=utf8
# Crawl the article index at cb.drops.wiki and save each article as a PDF
# via pdfkit/wkhtmltopdf, fetching index pages in parallel with a process pool.

import os
import socket
import sys

import pdfkit
import urllib2
from bs4 import BeautifulSoup
from multiprocessing import Pool

socket.setdefaulttimeout(60)
reload(sys)
sys.setdefaultencoding('utf-8')


def url_open(url):
    # Fetch a page with a browser User-Agent and return it as a BeautifulSoup
    # tree; return None if the request fails.
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36')
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=headers)
    try:
        page = urllib2.urlopen(request, timeout=60)
    except (urllib2.URLError, socket.timeout):
        return None
    contents = page.read()
    return BeautifulSoup(contents.decode('utf-8', 'ignore'), "lxml")


def retrieve_pdf(filename, link):
    # Render the article to a PDF. wkhtmltopdf is noisy on stderr, so redirect
    # stderr to a log file and always restore it, even if pdfkit raises.
    saved_stderr = sys.stderr
    with open('errlog.txt', 'w+') as log:
        sys.stderr = log
        try:
            pdfkit.from_url(link, filename)
        finally:
            sys.stderr = saved_stderr


def strip_char(string):
    # Drop characters that are illegal in Windows file names.
    illegal = ['*', '/', '\\', ':', '"', '?', '<', '>', '|']
    return ''.join(c for c in string if c not in illegal)


def crawler(root, url, num):
    soup = url_open(url)
    if soup is None:
        return
    for tr in soup.find_all("tr"):
        td = tr.find_all('td')
        # Each article row has four cells: date, title, category, poster.
        if len(td) < 4:
            continue
        if td[0].get_text() == u"提交时间":  # header row ("submission time")
            continue
        date = td[0].get_text()
        title = td[1].get_text()
        category = td[2].get_text()
        poster = td[3].get_text()
        print date + " " + title + " " + category + " " + poster

        onclick = tr.get('onclick')
        if not onclick:
            continue
        # The row's onclick handler carries the relative article URL as its
        # first quoted argument; rebuild the absolute link from it.
        link = root + '.'.join(onclick.split('\'')[1].split('.')[1:])
        print link

        print "Retrieving PDF..."
        filename = strip_char(title + '.pdf')
        print filename
        # Write to a per-worker temporary name first, then rename on success.
        temp_name = 'temp' + str(num) + '.pdf'
        try:
            retrieve_pdf(temp_name, link)
        except Exception:
            # wkhtmltopdf often exits non-zero even when the PDF was written,
            # so success is judged by the file's existence below.
            pass
        if os.path.exists(temp_name):
            print "Retrieved Successfully!"
            os.rename(temp_name, filename)
        else:
            print 'Retrieve failed!'


def single_func(num):
    # Crawl one page of the search index.
    root = 'http://cb.drops.wiki'
    url = "http://cb.drops.wiki/search.php?kind=drops&keywords=&page=" + str(num)
    crawler(root, url, num)


if __name__ == '__main__':
    # single_func(1)  # single-page test
    # for page in range(1, 86):
    #     single_func(page)
    pool = Pool(processes=4)
    for i in range(1, 86):
        pool.apply_async(single_func, (i,))
    pool.close()
    pool.join()
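
# Note: pdfkit shells out to the wkhtmltopdf binary. If that binary is not on
# PATH, from_url() raises "No wkhtmltopdf executable found" and nothing is
# written. A minimal sketch of pointing pdfkit at the binary explicitly; the
# install path below is an assumption, adjust it to your machine:
#
#   config = pdfkit.configuration(
#       wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
#   pdfkit.from_url(link, filename, configuration=config)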