
A book-scraping program written in Python normally works fine, but on books with many chapters, garbled characters appear at the end of every paragraph

As the title says: where is the problem? For safety I have masked out the username and password.

#-*- coding:GB18030 -*-
import sys
import os
import re
import urllib.request
import http.cookiejar
import threading




urllogin = 'http://bbs.artx.cn/logging.php?action=login&loginsubmit=yes&inajax=1'
cj = http.cookiejar.CookieJar()
# build a new opener that carries the cookie jar
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# install the new opener globally
urllib.request.install_opener(opener)
# dict holding the POST data
postDict = {
    'formhash' : '00e0e70f',
    'referer' : 'http%3A%2F%2Fguji.artx.cn%2F',
    'loginfield' : 'username',
    'username' : '********',
    'password' : '********',
    'questionid' : '0',
    'answer' : '',
    'cookietime' : '2592000',
            }
# urlencode the POST dict, then encode it to UTF-8 bytes
postData = urllib.parse.urlencode(postDict).encode('utf-8')
# request the login page with the POST data attached
resp = urllib.request.urlopen(urllogin, postData)
html = resp.read()
resp2 = urllib.request.urlopen('http://guji.artx.cn/', postData)
    
def main():
    chooseop = input('Choose an operation:\n1. Parse a single book page\n2. Parse a topic page\n3. Quit\n')
    if chooseop == '1':
        processurl(input('Enter the URL of the book\'s main page:\n'), 1)
    elif chooseop == '2':
        processsub(input('Enter the URL of the topic page:\n'))
    elif chooseop == '3':
        sys.exit()
# handle a book's introduction page
def processurl(url,type):
    response = urllib.request.urlopen(url)
    html = response.read()
    # decode the HTML
    uhtml = html.decode('utf-8')
    # extract the URLs of all chapter pages
    urls = re.findall('(?<=<li><a href=\").*\.html(?=\">)',uhtml)
    # extract all chapter titles
    titles = re.findall('(?<=\.html\">).*(?=</a></li>)',uhtml)
    # replace the &nbsp; placeholders and normalize parentheses in the chapter titles;
    # note: reassigning the loop variable never changed the list, so rebuild it instead
    titles = [t.replace('&nbsp;', ' ').replace('(', '(').replace(')', ')') for t in titles]
    # extract the book's overall title
    titleinlist = re.findall('(?<=title"><h3>).*(?=</h3></div>)',uhtml)
    # extract the library (库) the book belongs to
    kuinlist = re.findall('(?<=\.html>).库(?=\</a> )',uhtml)
    # extract the topic name
    kindinlist = re.findall('(?<=showmain_kind_z>).*?(?=</a>)', uhtml)
    kind = kindinlist[0]
    ku = kuinlist[0]
    title = titleinlist[0]
    if len(urls) == len(titles):
        processurl2(url, '简介', title, ku, kind)
        if len(urls) < 5:
            for i in range(len(urls)):
                processurl2("http://guji.artx.cn" + urls[i], titles[i], title, ku, kind)
            if type == 1:
                main()
        else:
            num = len(urls)
            every = num // 4
            mod = num % 4
            # split the URL and title lists into four segments
            urlsplit1 = urls[0:every]
            urlsplit2 = urls[every:every*2]
            urlsplit3 = urls[every*2:every*3]
            urlsplit4 = urls[every*3:every*4+mod]
            titlesplit1 = titles[0:every]
            titlesplit2 = titles[every:every*2]
            titlesplit3 = titles[every*2:every*3]
            titlesplit4 = titles[every*3:every*4+mod]
            print ("解析出的链接数和章节数相等,匹配正确!\n")
            thread1 = Thread(1, 1, urlsplit1, titlesplit1, title, ku, kind)
            thread2 = Thread(2, 2, urlsplit2, titlesplit2, title, ku, kind)
            thread3 = Thread(3, 3, urlsplit3, titlesplit3, title, ku, kind)
            thread4 = Thread(4, 4, urlsplit4, titlesplit4, title, ku, kind)
            thread1.start()
            thread2.start()
            thread3.start()
            thread4.start()
            if type == 1:
                main()


    else:
        print ("解析出的章节数和链接数不相等,可能存在错误!\n")
    #如果是抓取单本书,返回主操作,否则跳过
    


# process the body text
def text(i):
    # roughly cut out the body text
    text1 = re.findall('(?<=font-size:14px;\">).*?(?=</div>)',i,re.DOTALL)
    # strip the reading-notes markup from the text
    garbages1 = re.findall('<font class=bj_style>.*?</a></font>',text1[0],re.DOTALL)
    for g1 in garbages1:
        text1[0] = text1[0].replace(g1,'\n  ')
    # strip the '中国古籍全录' site-credit links from the text
    garbages2 = re.findall('<a href=.http.*?</a>',text1[0],re.DOTALL)
    for g2 in garbages2:
        text1[0] = text1[0].replace(g2,'')
    # strip <font class=***> tags from the text
    garbages3 = re.findall('<font class=.*?>',text1[0],re.DOTALL)
    for g3 in garbages3:
        text1[0] = text1[0].replace(g3,'')
    # strip annotation links from the text
    garbages4 = re.findall('<a href=.*?</a>',text1[0],re.DOTALL)
    for g4 in garbages4:
        text1[0] = text1[0].replace(g4,'')
    # strip </strong>
    text1[0] = text1[0].replace('</strong>','')
    # strip <strong>
    text1[0] = text1[0].replace('<strong>','')
    # strip any leftover </font>
    text1[0] = text1[0].replace('</font>','')
    # strip any leftover <br>
    text1[0] = text1[0].replace("<br>","")
    # strip the &nbsp; space placeholders
    text1[0] = text1[0].replace("&nbsp;","")
    # replace ? with a full-width question mark
    text1[0] = text1[0].replace("?","?")
    # replace &quot; with a double quote
    text1[0] = text1[0].replace("&quot;","\"")
    return text1[0]
# handle a topic page
def processsub(url):
    response = urllib.request.urlopen(url)
    html = response.read()
    # decode the HTML
    uhtml = html.decode('utf-8')
    urls = re.findall('(?<=<a href=\").*?html(?=\" title=)',uhtml)
    titles = re.findall('(?<=\.html\" title=\").*?(?=\" target=_blank>)',uhtml,re.DOTALL)
    numt = len(titles)
    if numt == len(urls):
        print ('Parsed book count matches link count, the match looks correct!\n')
        # clean the garbled placeholder entities out of the book titles;
        # note: reassigning the loop variable never changed the list, so rebuild it instead
        titles = [t.replace('&nbsp;', ' ').replace('(', '(').replace(')', ')') for t in titles]
        subinlist = re.findall('(?<=html">).{2,10}(?=</a></div>)',uhtml)
        print ('The topic to download is:\n', subinlist[0], '\nIt contains these books:\n', titles)
        global thread1
        global thread2
        global thread3
        global thread4
        for i in urls:
            do = processurl(i,2)
            #while thread1.isAlive == False and thread2.isAlive == False and thread3.isAlive == False and thread4.isAlive == False:
                #continue
    else:
        print ('Parsed book count and link count differ, something may be wrong!\n')
        


# worker thread class
class Thread(threading.Thread):
    def __init__(self, num, interval, urlsplit, titlesplit, title, ku, kind):
        threading.Thread.__init__(self)  
        self.thread_num = num
        self.interval = interval
        self.thread_stop = False
        self.urlsplit = urlsplit
        self.titlesplit = titlesplit
        self.title = title
        self.ku = ku
        self.kind = kind
    # each thread runs processurl2 over its own slice of chapters
    def run(self):
        while not self.thread_stop:
            for i in range(len(self.urlsplit)):
                url1 = self.urlsplit[i]
                title1 = self.titlesplit[i]
                processurl2("http://guji.artx.cn" + url1, title1, self.title, self.ku, self.kind)
            self.stop()
            
    def stop(self):  
        self.thread_stop = True


        


# handle a chapter page: urls is the chapter URL, titles the chapter title, title the book's overall title, ku the library it belongs to, kind the topic name
def processurl2(urls, titles, title, ku, kind):
    #try:
        response1 = urllib.request.urlopen(urls)
        html1 = response1.read()
        uhtml1 = html1.decode('utf-8')
        # create the library/topic/book folder if it does not exist yet
        if not os.path.exists('E:/downloadedbooks/' + ku + '/' + kind + '/' + title):
            os.makedirs('E:/downloadedbooks/' + ku + '/' + kind + '/' + title)
        # extract the chapter text
        article = text(uhtml1)
        # write the chapter into a GB18030-encoded .txt file named after the chapter
        f = open('E:/downloadedbooks/' + ku + '/' + kind + '/' + title + '/' + titles + '.txt','w',encoding='GB18030')
        f.write(str(article))
        f.close()
        print (titles, '.........download complete.')
    #except:
        #print('This chapter raised an exception, please handle it manually!')


main()

kun坤 2020-06-03 11:29:00
1 answer
  • Use pyquery; jQuery-style selectors are a better fit for manipulating HTML than regexes.
    Here is a fragment from a scraper I wrote recently for pulling business data:

        def parseUpperSixteen(self, pqobj):        
            basicTable = pqobj(r'td > table > tr:eq(1) td table')
            # row 1
            fieldpattern = r'table > tr:eq({0})'.format(0)
            field = basicTable(fieldpattern)
            self.people.jobSituation = field(r'td select:eq(0) :selected').text()
            self.people.jobSituationDetail = field(r'td select input').val() if field(r'td select input').val() is not None else ""
            self.people.jobForm = field(r'td select:eq(1) :selected').text()
            # row 2
            fieldpattern = r'table > tr:eq({0})'.format(1)
            field = basicTable(fieldpattern)
            self.people.jobOfficalName = field(r'td:eq(0) :selected').text() if field(r'td:eq(0) :selected').text() != "" else "无单位"
            self.people.labourContract = field(r'td:eq(1) :selected').text()
            # row 3
            fieldpattern = r'table > tr:eq({0})'.format(2)
            field = basicTable(fieldpattern)
            self.people.unemploymentCase = field(r'td:eq(0) :selected').text()
            self.people.unemploymentReason = field(r'td:eq(1) :selected').text()
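    For context, pqobj above is a PyQuery object built from the page markup. A minimal, hypothetical setup (the URL is purely illustrative; PyQuery wraps lxml and accepts a plain HTML string):

        # hypothetical setup for the fragment above
        import urllib.request
        from pyquery import PyQuery

        html = urllib.request.urlopen('http://example.com/form.html').read()
        # build the PyQuery document from the decoded HTML string
        pqobj = PyQuery(html.decode('utf-8'))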

    ###### The mixed-up content is most likely caused by the multithreading; switch to a single thread and it should stop happening. I don't see any proper thread-synchronization mechanism in the code. If you want multithreading, it is best combined with a queue, which also makes it easier to control the number of threads, and should avoid the situation you are seeing. ###### What I mean is that the first few chapters are fine; once the download intensity goes up, a few random characters from anywhere in the full text show up at the end of each paragraph. Fetching a single chapter on its own never goes wrong.
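    A minimal sketch of the queue-plus-workers approach suggested above, reusing the processurl2 from the question; download_all and NUM_WORKERS are illustrative names, not part of the original code:

        import queue
        import threading

        NUM_WORKERS = 4

        def download_all(urls, titles, title, ku, kind):
            # fill a shared queue with (chapter_url, chapter_title) jobs
            jobs = queue.Queue()
            for pair in zip(urls, titles):
                jobs.put(pair)

            def worker():
                while True:
                    try:
                        url1, title1 = jobs.get_nowait()
                    except queue.Empty:
                        return  # no jobs left, this worker is done
                    processurl2("http://guji.artx.cn" + url1, title1, title, ku, kind)

            # a fixed pool of workers pulls jobs until the queue is empty
            workers = [threading.Thread(target=worker) for _ in range(NUM_WORKERS)]
            for w in workers:
                w.start()
            for w in workers:
                w.join()  # block until every chapter has been written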

    2020-06-03 11:29:11