Spider source code
import json
import random

import scrapy

from ..items import SinastarItem  # assumes the usual Scrapy project layout

def parse(self, response):
    data = json.loads(response.text)['result']['data']
    if data is None:
        return
    for entry in data:  # renamed from "str" to avoid shadowing the built-in
        it_item = SinastarItem()
        it_item['userid'] = entry['_id']
        it_item['name'] = entry['title']
        it_item['starurl'] = entry['url']
        it_item['pic'] = entry['pic']
        if entry['birth_year'] != '' and entry['birth_month'] != '' and entry['birth_day'] != '':
            it_item['birthday'] = entry['birth_year'] + "/" + entry['birth_month'] + "/" + entry['birth_day']
        else:
            it_item['birthday'] = ''
        it_item['xingzuo'] = entry['astrology']
        it_item['sex'] = entry['gender']
        it_item['profession'] = entry['profession']
        it_item['area'] = entry['nationality']
        it_item['height'] = entry['height']
        if it_item['userid'] is not None:
            intro_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=intro&id=' + it_item['userid']
            base_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=base&id=' + it_item['userid']
            photo_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=photo&id=' + it_item['userid']
            # Three parallel requests, all carrying the same item via meta
            yield scrapy.Request(intro_url, callback=self.info_item, meta={'item': it_item, 'type': 'intro'})
            yield scrapy.Request(base_url, callback=self.info_item, meta={'item': it_item, 'type': 'base'})
            yield scrapy.Request(photo_url, callback=self.photo_item, meta={'item': it_item})
# Photo gallery
def photo_item(self, response):
    item = response.meta['item']
    # Pull the gallery image URLs
    photoji = response.xpath("//*[@id='waterfall_roles']/li/a/img/@src").extract()
    if len(photoji) > 10:
        # Randomly sample 10 URLs from the list
        imgurl = random.sample(photoji, 10)
        item['imgurl'] = ','.join(imgurl)
    else:
        # extract() always returns a list (never None), so join works even when empty
        item['imgurl'] = ','.join(photoji)
    return item
# Introduction / basic info
def info_item(self, response):
    item = response.meta['item']
    infodata = response.xpath("//div[@class='detail-base']/p/text()").extract()
    if response.meta['type'] == 'intro':  # intro-type page
        item['intro'] = infodata
    else:
        item['base'] = infodata
    return item
Pipeline source code
import logging

def process_item(self, item, spider):
    data = dict(item)
    imgurl = data['imgurl']
    base = data['base']
    intro = data['intro']
    userid = data['userid']
    name = data['name']
    sex = data['sex']
    area = data['area']
    xingzuo = data['xingzuo']    # comes back empty
    birthday = data['birthday']  # comes back empty
    height = data['height']      # comes back empty
    pic = data['pic']
    profession = data['profession']
    try:
        # Check whether this userid already exists (duplicate check)
        onlysql = "select * from tw_cms_article_star where userid = %s"
        self.cur.execute(onlysql, (userid,))
        repetition = self.cur.fetchone()
        if repetition is not None:
            # Row already exists, skip the insert
            pass
        else:
            # Pass the values as a parameter tuple; the original
            # "... values(%s, ...)" % [list] raises a TypeError
            self.cur.execute(
                """insert into tw_cms_article_star
                   (name, sex, area, xingzuo, birthday, height, pic, userid,
                    intro, base, profession, imgurl)
                   values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (name, sex, area, xingzuo, birthday, height, pic, userid,
                 intro, base, profession, imgurl))
        # Commit the statement
        self.mydb.commit()
        self.cur.close()  # note: later items will need a fresh cursor
    except Exception as error:
        # Log the error and roll back
        logging.error(error)
        self.mydb.rollback()
        self.mydb.close()
    return item

imgurl = data['imgurl']
base = data['base']
intro = data['intro']
These three variables are different from the rest: the other fields are filled in from the initially crawled listing page, while these three are assigned only when the next level of pages is crawled.
Intended behavior: gather the data scraped from the initial page together with the data from the several detail pages, and insert everything into the database in one go.
Problem description: right now the initial scrape gets its data, and printing shows three separate results, the initial page's data plus the other pages'. So on the first pass imgurl, base and intro don't exist yet, which raises a KeyError; I tried testing for the missing keys, but it still keeps erroring, so the inserts keep failing.
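The unconditional data['imgurl'], data['base'] and data['intro'] reads at the top of process_item are exactly where that KeyError comes from: each of the three callbacks returns its own copy of the item with only some of those fields set. A dict.get guard lets partial items through without raising; this is a minimal sketch, not the original code:

# Sketch: default the detail-page fields to '' so partial items don't raise
def process_item(self, item, spider):
    data = dict(item)
    imgurl = data.get('imgurl', '')  # set only by photo_item
    base = data.get('base', '')      # set only by info_item with type 'base'
    intro = data.get('intro', '')    # set only by info_item with type 'intro'
    # ... duplicate check and insert as above ...
    return item

Note this only silences the KeyError: each star still reaches the pipeline three times as a partial item, and the duplicate check on userid then keeps the later, more complete copies out of the table.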
Hoping for a better solution~
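For what it's worth, a common Scrapy pattern for gathering fields from several pages into one item is to chain the requests instead of yielding all three in parallel: parse() yields only the intro request, each callback fills in its own field and yields the request for the next page with the same item in meta, and only the last callback returns the item, so the pipeline sees exactly one complete item per star. Below is a minimal sketch under that assumption; intro_item and base_item are new names introduced here, and the chaining order intro -> base -> photo is arbitrary:

import random
import scrapy

# In parse(), keep only:
#   yield scrapy.Request(intro_url, callback=self.intro_item, meta={'item': it_item})

def intro_item(self, response):
    item = response.meta['item']
    item['intro'] = response.xpath("//div[@class='detail-base']/p/text()").extract()
    base_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=base&id=' + item['userid']
    yield scrapy.Request(base_url, callback=self.base_item, meta={'item': item})

def base_item(self, response):
    item = response.meta['item']
    item['base'] = response.xpath("//div[@class='detail-base']/p/text()").extract()
    photo_url = 'http://ent.sina.com.cn/ku/star_detail_index.d.html?type=photo&id=' + item['userid']
    yield scrapy.Request(photo_url, callback=self.photo_item, meta={'item': item})

def photo_item(self, response):
    item = response.meta['item']
    photoji = response.xpath("//*[@id='waterfall_roles']/li/a/img/@src").extract()
    item['imgurl'] = ','.join(random.sample(photoji, 10) if len(photoji) > 10 else photoji)
    return item  # the item is complete only at this point

With the chain in place every field is present by the time the item reaches the pipeline, so the dict.get guards above become optional.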