As the title says: where is the problem in this code? For safety I have replaced the username and password.
#-*- coding:GB18030 -*-
import sys
import os
import re
import urllib.request
import urllib.parse
import http.cookiejar
import threading

urllogin = 'http://bbs.artx.cn/logging.php?action=login&loginsubmit=yes&inajax=1'
cj = http.cookiejar.CookieJar()
# Build a new opener that carries the cookie jar
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# Install the opener globally
urllib.request.install_opener(opener)
# Dictionary holding the POST data
postDict = {
    'formhash' : '00e0e70f',
    'referer' : 'http%3A%2F%2Fguji.artx.cn%2F',
    'loginfield' : 'username',
    'username' : '********',
    'password' : '********',
    'questionid' : '0',
    'answer' : '',
    'cookietime' : '2592000',
}
# URL-encode the POST data, then encode it as UTF-8 bytes
postData = urllib.parse.urlencode(postDict).encode('utf-8')
# Request the login page with the POST data
resp = urllib.request.urlopen(urllogin, postData)
html = resp.read()
resp2 = urllib.request.urlopen('http://guji.artx.cn/', postData)
def main():
    chooseop = input('请选择操作:\n1.解析单本书籍介绍页面\n2.解析书籍专题页面\n3.退出程序\n')
    if chooseop == '1':
        processurl(input('请输入要抓取的文章主页面的地址:\n'), 1)
    elif chooseop == '2':
        processsub(input('请输入要抓取的专题页面的地址:\n'))
    elif chooseop == '3':
        sys.exit()
# Process a book's introduction page
def processurl(url, type):
    response = urllib.request.urlopen(url)
    html = response.read()
    # Decode the HTML
    uhtml = html.decode('utf-8')
    # Extract the URLs of all chapter pages
    urls = re.findall('(?<=<li><a href=\").*\.html(?=\">)', uhtml)
    # Extract all chapter titles
    titles = re.findall('(?<=\.html\">).*(?=</a></li>)', uhtml)
    # Replace the &nbsp; placeholders and full-width brackets in the chapter titles
    # (write back by index; reassigning the loop variable would not modify the list)
    for i in range(len(titles)):
        titles[i] = titles[i].replace('&nbsp;', ' ')
        titles[i] = titles[i].replace('(', '(')
        titles[i] = titles[i].replace(')', ')')
    # Extract the book's main title
    titleinlist = re.findall('(?<=title"><h3>).*(?=</h3></div>)', uhtml)
    # Extract the library the book belongs to
    kuinlist = re.findall('(?<=\.html>).库(?=\</a> )', uhtml)
    # Extract the topic name
    kindinlist = re.findall('(?<=showmain_kind_z>).*?(?=</a>)', uhtml)
    kind = kindinlist[0]
    ku = kuinlist[0]
    title = titleinlist[0]
    if len(urls) == len(titles):
        processurl2(url, '简介', title, ku, kind)
        if len(urls) < 5:
            for i in range(len(urls)):
                processurl2("http://guji.artx.cn" + urls[i], titles[i], title, ku, kind)
            if type == 1:
                main()
        else:
            t1 = ''
            t2 = ''
            t3 = ''
            t4 = ''
            num = len(urls)
            every = num // 4
            mod = num % 4
            # Split the URL and title lists into four slices
            urlsplit1 = urls[0:every]
            urlsplit2 = urls[every:every*2]
            urlsplit3 = urls[every*2:every*3]
            urlsplit4 = urls[every*3:every*4+mod]
            titlesplit1 = titles[0:every]
            titlesplit2 = titles[every:every*2]
            titlesplit3 = titles[every*2:every*3]
            titlesplit4 = titles[every*3:every*4+mod]
            print ("解析出的链接数和章节数相等,匹配正确!\n")
            thread1 = Thread(1, 1, urlsplit1, titlesplit1, title, ku, kind)
            thread2 = Thread(2, 2, urlsplit2, titlesplit2, title, ku, kind)
            thread3 = Thread(3, 3, urlsplit3, titlesplit3, title, ku, kind)
            thread4 = Thread(4, 4, urlsplit4, titlesplit4, title, ku, kind)
            thread1.start()
            thread2.start()
            thread3.start()
            thread4.start()
            if type == 1:
                main()
    else:
        print ("解析出的章节数和链接数不相等,可能存在错误!\n")
    # If a single book is being fetched, return to the main menu; otherwise fall through
# Clean up the body text
def text(i):
    # Roughly cut out the body text
    text1 = re.findall('(?<=font-size:14px;\">).*?(?=</div>)', i, re.DOTALL)
    # Remove the "reading notes" markup from the text
    garbages1 = re.findall('<font class=bj_style>.*?</a></font>', text1[0], re.DOTALL)
    for g1 in garbages1:
        text1[0] = text1[0].replace(g1, '\n ')
    # Remove the '中国古籍全录' watermark markup from the text
    garbages2 = re.findall('<a href=.http.*?</a>', text1[0], re.DOTALL)
    for g2 in garbages2:
        text1[0] = text1[0].replace(g2, '')
    # Remove <font class=***> tags from the text
    garbages3 = re.findall('<font class=.*?>', text1[0], re.DOTALL)
    for g3 in garbages3:
        text1[0] = text1[0].replace(g3, '')
    # Remove annotation links from the text
    garbages4 = re.findall('<a href=.*?</a>', text1[0], re.DOTALL)
    for g4 in garbages4:
        text1[0] = text1[0].replace(g4, '')
    # Remove </strong> tags
    text1[0] = text1[0].replace('</strong>', '')
    # Remove <strong> tags
    text1[0] = text1[0].replace('<strong>', '')
    # Remove any remaining </font> tags
    text1[0] = text1[0].replace('</font>', '')
    # Remove any remaining <br> tags
    text1[0] = text1[0].replace("<br>", "")
    # Remove the &nbsp; placeholders
    text1[0] = text1[0].replace("&nbsp;", "")
    # Replace ? in the text with a question mark
    text1[0] = text1[0].replace("?", "?")
    # Replace &quot; in the text with a double quote
    text1[0] = text1[0].replace("&quot;", "\"")
    return text1[0]
# Process a topic page
def processsub(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    # Decode the HTML
    uhtml = html.decode('utf-8')
    urls = re.findall('(?<=<a href=\").*?html(?=\" title=)', uhtml)
    titles = re.findall('(?<=\.html\" title=\").*?(?=\" target=_blank>)', uhtml, re.DOTALL)
    numt = len(titles)
    if numt == len(urls):
        print ('解析出的书籍数与链接数相等,匹配正确!\n')
        # Clean up placeholders in the book titles
        # (write back by index; reassigning the loop variable would not modify the list)
        for i in range(len(titles)):
            titles[i] = titles[i].replace('&nbsp;', ' ')
            titles[i] = titles[i].replace('(', '(')
            titles[i] = titles[i].replace(')', ')')
        subinlist = re.findall('(?<=html">).{2,10}(?=</a></div>)', uhtml)
        print ('您要下载的专题是:\n', subinlist[0], '\n其中的书籍有:\n', titles)
        global thread1
        global thread2
        global thread3
        global thread4
        for i in urls:
            do = processurl(i, 2)
        #while thread1.isAlive == False and thread2.isAlive == False and thread3.isAlive == False and thread4.isAlive == False:
            #continue
    else:
        print ('解析出的书籍数和链接数不相等,可能存在错误!\n')
# Worker thread class
class Thread(threading.Thread):
    def __init__(self, num, interval, urlsplit, titlesplit, title, ku, kind):
        threading.Thread.__init__(self)
        self.thread_num = num
        self.interval = interval
        self.thread_stop = False
        self.urlsplit = urlsplit
        self.titlesplit = titlesplit
        self.title = title
        self.ku = ku
        self.kind = kind
    # Each thread repeatedly calls processurl2 for the URLs in its slice
    def run(self):
        while self.thread_stop == False:
            for i in range(len(self.urlsplit)):
                url1 = self.urlsplit[i]
                title1 = self.titlesplit[i]
                processurl2("http://guji.artx.cn" + url1, title1, self.title, self.ku, self.kind)
            self.stop()
    def stop(self):
        self.thread_stop = True
# Process a chapter page. urls is the chapter URL, titles is the chapter title,
# title is the book's main title, ku is the library the book belongs to, kind is the topic name
def processurl2(urls, titles, title, ku, kind):
    #try:
    response1 = urllib.request.urlopen(urls)
    html1 = response1.read()
    uhtml1 = html1.decode('utf-8')
    # Create the folder named after the library, topic and book title if it does not exist yet
    if os.path.exists('E:/downloadedbooks/' + ku + '/' + kind + '/' + title) == False:
        os.makedirs('E:/downloadedbooks/' + ku + '/' + kind + '/' + title)
    else:
        pass
    # Get the chapter text
    article = text(uhtml1)
    # Create a TXT file named after the chapter in that folder, encoded as GB18030, and write the content
    f = open('E:/downloadedbooks/' + ku + '/' + kind + '/' + title + '/' + titles + '.txt', 'w', encoding='GB18030')
    f.write(str(article))
    f.close()
    print (titles, '.........下载完成.')
    #except:
        #print('本章出现异常,请手工处理!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

main()
Use pyquery: when working with HTML, the jQuery style is a much better fit than regex.
Here is a snippet from a crawler I just wrote that scrapes business data:
def parseUpperSixteen(self, pqobj):
    basicTable = pqobj(r'td > table > tr:eq(1) td table')
    # Row 1
    fieldpattern = r'table > tr:eq({0})'.format(0)
    field = basicTable(fieldpattern)
    self.people.jobSituation = field(r'td select:eq(0) :selected').text()
    self.people.jobSituationDetail = field(r'td select input').val() if field(r'td select input').val() is not None else ""
    self.people.jobForm = field(r'td select:eq(1) :selected').text()
    # Row 2
    fieldpattern = r'table > tr:eq({0})'.format(1)
    field = basicTable(fieldpattern)
    self.people.jobOfficalName = field(r'td:eq(0) :selected').text() if field(r'td:eq(0) :selected').text() != "" else "无单位"
    self.people.labourContract = field(r'td:eq(1) :selected').text()
    # Row 3
    fieldpattern = r'table > tr:eq({0})'.format(2)
    field = basicTable(fieldpattern)
    self.people.unemploymentCase = field(r'td:eq(0) :selected').text()
    self.people.unemploymentReason = field(r'td:eq(1) :selected').text()
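Applied to your case, the chapter-link extraction in processurl could look roughly like the sketch below. This is only an illustration: the 'li a' selector and the parse_chapters name are assumptions about the markup on guji.artx.cn and may need adjusting, but lxml also decodes entities such as &nbsp; for you, so most of the manual replace calls go away.

def parse_chapters(uhtml):
    # Sketch: pyquery instead of lookbehind/lookahead regexes.
    # Assumption: chapter links sit in <li><a href="...">title</a></li> elements.
    from pyquery import PyQuery as pq
    doc = pq(uhtml)
    chapters = []
    for a in doc('li a').items():              # iterate over matched <a> elements
        href = a.attr('href')
        if href and href.endswith('.html'):
            chapters.append((href, a.text()))  # (chapter URL, chapter title)
    return chapters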
The content getting mixed together should be caused by the multithreading; if you switch to a single thread it should stop happening. I looked through the code and there is no proper thread synchronization. If you want multiple threads, it is best to pair them with a queue: that also makes it easier to control the number of threads, and you should no longer see the behaviour you are getting.

What I mean is that the first few chapters are fine, but once the download volume goes up, a few random characters from anywhere in the full text appear at the end of each paragraph. Fetching a single chapter on its own never has the problem.
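A minimal sketch of the queue-based approach suggested above, under the assumption that it is wired into the original script's processurl2(url, chapter_title, title, ku, kind): each worker pulls one (url, title) pair at a time from a queue.Queue, so the chapter lists are never split up or shared between threads, and the thread count is a single parameter.

import queue
import threading

def download_all(urls, titles, title, ku, kind, n_workers=4):
    # Fill a task queue with one (chapter URL, chapter title) pair per chapter
    tasks = queue.Queue()
    for u, t in zip(urls, titles):
        tasks.put((u, t))

    def worker():
        while True:
            try:
                u, t = tasks.get_nowait()   # take exactly one chapter per iteration
            except queue.Empty:
                return                      # queue drained, worker exits
            processurl2("http://guji.artx.cn" + u, t, title, ku, kind)
            tasks.task_done()

    workers = [threading.Thread(target=worker) for _ in range(n_workers)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()                            # wait until every chapter has been written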