
Python Scrapy: combining data from several different pages; the pipeline is called multiple times, causing a dict KeyError?

Spider source code

import json
import random

import scrapy

from ..items import SinastarItem  # assumed project layout; adjust to the real items module


def parse(self, response):
    data = json.loads(response.text)['result']['data']
    if data is None:
        return
    for entry in data:  # renamed from `str`, which shadowed the built-in
        it_item = SinastarItem()
        it_item['userid'] = entry['_id']
        it_item['name'] = entry['title']
        it_item['starurl'] = entry['url']
        it_item['pic'] = entry['pic']

        if entry['birth_year'] != '' and entry['birth_month'] != '' and entry['birth_day'] != '':
            it_item['birthday'] = entry['birth_year'] + "/" + entry['birth_month'] + "/" + entry['birth_day']
        else:
            it_item['birthday'] = ''
        it_item['xingzuo'] = entry['astrology']
        it_item['sex'] = entry['gender']
        it_item['profession'] = entry['profession']
        it_item['area'] = entry['nationality']
        it_item['height'] = entry['height']
        if it_item['userid'] is not None:
            intro_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=intro&id=' + it_item['userid']
            base_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=base&id=' + it_item['userid']
            photo_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=photo&id=' + it_item['userid']
            # The three requests below are scheduled independently, and each
            # callback returns its own partially filled copy of the item, so
            # the pipeline is invoked three times per entry.
            yield scrapy.Request(intro_url, callback=self.info_item, meta={'item': it_item, 'type': 'intro'})
            yield scrapy.Request(base_url, callback=self.info_item, meta={'item': it_item, 'type': 'base'})
            yield scrapy.Request(photo_url, callback=self.photo_item, meta={'item': it_item})

# Photo gallery
def photo_item(self, response):
    item = response.meta['item']
    photoji = response.xpath("//*[@id='waterfall_roles']/li/a/img/@src").extract()  # extract() always returns a list, never None
    if len(photoji) > 10:
        imgurl = random.sample(photoji, 10)  # pick 10 photo URLs at random
        item['imgurl'] = ','.join(imgurl)
    else:
        item['imgurl'] = ','.join(photoji)  # an empty list joins to ''
    return item

## Profile text
def info_item(self, response):
    item = response.meta['item']
    # extract() returns a list of paragraphs; join them so the value can be
    # stored in a single database column later.
    infodata = '\n'.join(response.xpath("//div[@class='detail-base']/p/text()").extract())
    if response.meta['type'] == 'intro':  # intro page
        item['intro'] = infodata
    else:  # base-info page
        item['base'] = infodata
    return item

Pipeline source code

def process_item(self, item, spider):
    data = dict(item)
    imgurl = data['imgurl']
    base = data['base']
    intro = data['intro']
    userid = data['userid']
    name = data['name']
    sex = data['sex']
    area = data['area']
    xingzuo = data['xingzuo']  # empty at this point
    birthday = data['birthday']  # empty at this point
    height = data['height']  # empty at this point
    pic = data['pic']
    profession = data['profession']

    try:
        # Check for a duplicate userid. Use a parameterized query rather
        # than string interpolation to avoid quoting bugs and SQL injection.
        self.cur.execute("select * from tw_cms_article_star where userid = %s", (userid,))
        repetition = self.cur.fetchone()
        if repetition is not None:
            # The record already exists, so skip the insert.
            pass
        else:
            # Pass the values as a parameter tuple; the original
            # `sql % [list]` raises a TypeError.
            self.cur.execute(
                """insert into tw_cms_article_star
                   (name,sex,area,xingzuo,birthday,height,pic,userid,intro,base,profession,imgurl)
                   values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
                (name, sex, area, xingzuo, birthday, height, pic, userid,
                 intro, base, profession, imgurl))
            self.mydb.commit()
    except Exception as error:
        # Log the error and roll back the failed transaction.
        logging.error(error)
        self.mydb.rollback()

    # Do not close the cursor or connection here: process_item runs once per
    # item, so closing after the first item breaks every later one. Close
    # them once in close_spider instead (see the sketch below).
    return item
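For completeness, a minimal sketch of the once-per-crawl connection handling, assuming PyMySQL; the class name and connection settings here are placeholders, not the original code:

import pymysql


class SinastarPipeline:
    def open_spider(self, spider):
        # Placeholder connection settings; substitute the real ones.
        self.mydb = pymysql.connect(host='localhost', user='root',
                                    password='secret', database='sinastar',
                                    charset='utf8mb4')
        self.cur = self.mydb.cursor()

    def close_spider(self, spider):
        # Close exactly once, after the whole crawl has finished.
        self.cur.close()
        self.mydb.close()

With a UNIQUE index on userid, INSERT IGNORE could also replace the select-then-insert round trip entirely.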

imgurl = data['imgurl']
base = data['base']
intro = data['intro']

These three values are only filled in by the second-level detail pages; everything else on the item is assigned on the initial listing page.

Goal: combine the data scraped from the initial page with the data from the several detail pages and write it to the database in a single insert.

Problem description: after the initial page is scraped, print shows three partial items per entry, each holding the listing-page fields plus whatever its own detail page added. So on any given pipeline call at least one of imgurl, base, and intro has never been set, and reading it raises KeyError. I tried testing whether the keys exist, but it still keeps erroring and the insert keeps failing.
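That KeyError is expected behavior: a Scrapy Item only exposes the fields that have actually been set, so dict(item) built from the intro callback's copy has no 'imgurl' key at all. A self-contained illustration with a stand-in item:

import scrapy

class DemoItem(scrapy.Item):  # stand-in with two of SinastarItem's fields
    intro = scrapy.Field()
    imgurl = scrapy.Field()

it = DemoItem()
it['intro'] = 'some text'
print(dict(it))            # {'intro': 'some text'}, no 'imgurl' key yet
print(dict(it)['imgurl'])  # raises KeyError: 'imgurl'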

Is there a better way to do this?
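One approach that fits this goal is to chain the three detail requests instead of yielding them in parallel: each callback fills in its own fields and then yields the request for the next page, and only the last callback yields the item. The pipeline then runs exactly once per entry, with every key present. A minimal sketch, reusing the item, URLs, and XPaths from the code above (parse would yield only the intro request to start the chain):

def parse(self, response):
    # ... build it_item exactly as before, then start the chain:
    yield scrapy.Request(intro_url, callback=self.intro_item,
                         meta={'item': it_item})

def intro_item(self, response):
    item = response.meta['item']
    item['intro'] = '\n'.join(
        response.xpath("//div[@class='detail-base']/p/text()").extract())
    base_url = ('http://ent.sina.com.cn/ku/star_detail_index.d.html'
                '?type=base&id=' + item['userid'])
    # Hand the partially filled item on to the next request in the chain.
    yield scrapy.Request(base_url, callback=self.base_item, meta={'item': item})

def base_item(self, response):
    item = response.meta['item']
    item['base'] = '\n'.join(
        response.xpath("//div[@class='detail-base']/p/text()").extract())
    photo_url = ('http://ent.sina.com.cn/ku/star_detail_index.d.html'
                 '?type=photo&id=' + item['userid'])
    yield scrapy.Request(photo_url, callback=self.photo_item, meta={'item': item})

def photo_item(self, response):
    item = response.meta['item']
    photoji = response.xpath("//*[@id='waterfall_roles']/li/a/img/@src").extract()
    if len(photoji) > 10:
        photoji = random.sample(photoji, 10)
    item['imgurl'] = ','.join(photoji)
    # The item is complete only here, so this is the single point where it
    # reaches the pipeline.
    yield item

If any of the three pages can fail, consider setting errback on each Request so a lost response still yields the item with whatever was collected; otherwise the whole chain is dropped.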

爱吃鱼的程序员 2020-06-08 11:05:53