创建项目
neo@MacBook-Pro ~/Documents % scrapy startproject photo
neo@MacBook-Pro ~/Documents % cd photo
安装依赖库
neo@MacBook-Pro ~/Documents/photo % pip3 install Pillow
创建爬虫
neo@MacBook-Pro ~/Documents/photo % scrapy genspider jiandan jandan.net
忽略 robots.txt 规则
# Obey robots.txt rules
# Disabled so the spider can fetch pages regardless of jandan.net's robots.txt.
ROBOTSTXT_OBEY = False
配置图片保存路径与缩图
# Directory where downloaded images are stored
IMAGES_STORE = '/tmp/photo'
# DOWNLOAD_DELAY = 0.25
# Thumbnail sizes; defining this makes ImagesPipeline generate thumbnails
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (200, 200),
}
加入 get_media_requests() 与 item_completed() 方法
注意:PhotoPipeline(ImagesPipeline) 需要继承 ImagesPipeline
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class PhotoPipeline(ImagesPipeline):
    """Download the images listed in item['image_urls'] via ImagesPipeline."""

    # def process_item(self, item, spider):
    #     return item

    def get_media_requests(self, item, info):
        """Yield one download Request per URL in item['image_urls'].

        jandan.net pages use protocol-relative URLs (``//host/path``), so a
        scheme must be prepended before Scrapy can fetch them.
        """
        for image_url in item['image_urls']:
            # Bug fix: the original unconditionally prepended 'http:', which
            # corrupts URLs that already carry a scheme ('http:http://...').
            if image_url.startswith('//'):
                image_url = 'http:' + image_url
            yield scrapy.http.Request(image_url)

    def item_completed(self, results, item, info):
        """Record the stored file paths; drop items whose downloads all failed.

        ``results`` is a list of (success, info_or_failure) tuples produced by
        ImagesPipeline; only successful entries carry a 'path'.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
定义 Item 字段(items.py)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class PhotoItem(scrapy.Item):
    """Item carrying image URLs plus the fields ImagesPipeline fills in."""
    # URLs of the images to download (consumed by ImagesPipeline)
    image_urls = scrapy.Field()
    # download results populated automatically by ImagesPipeline
    images = scrapy.Field()
    # filesystem paths of the stored images (set in item_completed)
    image_paths = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy
from scrapy.loader import ItemLoader

from photo.items import PhotoItem


class JiandanSpider(scrapy.Spider):
    """Crawl jandan.net/ooxx, collecting every <img> src and paginating backwards."""

    name = 'jiandan'
    # allowed_domains = ['jandan.net']
    allowed_domains = []
    start_urls = ['http://jandan.net/ooxx']

    def parse(self, response):
        """Yield a PhotoItem of image URLs, then follow the previous-page link."""
        loader = ItemLoader(item=PhotoItem(), response=response)
        loader.add_xpath('image_urls', '//img//@src')
        yield loader.load_item()

        # Pagination: jandan's "previous-comment-page" link points at the
        # next-older page of images.
        next_page = response.xpath(
            '//a[@class="previous-comment-page"]//@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_page(self, response):
        """Single-page variant of parse(); same extraction, no pagination.

        NOTE(review): not referenced anywhere in this file — kept for
        interface compatibility.
        """
        loader = ItemLoader(item=PhotoItem(), response=response)
        loader.add_xpath('image_urls', '//img//@src')
        return loader.load_item()
原文出处:Netkiller 系列 手札
本文作者:陈景峯
转载请与作者联系,同时请务必标明文章原始出处和作者信息及本声明。