1. Spider
baidu_spider.py
from scrapy import Spider, cmdline


class BaiduSpider(Spider):
    name = "baidu_spider"

    start_urls = [
        "https://www.baidu.com/"
    ]

    custom_settings = {
        "SPIDER_DATA": "this is spider data",
        "DOWNLOADER_MIDDLEWARES": {
            "scrapys.mymiddleware.MyMiddleware": 100,
        },
        "ITEM_PIPELINES": {
            "scrapys.mypipeline.MyPipeline": 100,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapys.myspidermiddleware.MySpiderMiddleware": 100,
        },
    }

    def parse(self, response):
        pass


if __name__ == '__main__':
    cmdline.execute("scrapy crawl baidu_spider".split())
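The custom_settings attribute overrides the project-wide configuration for this spider only. As a point of comparison, a minimal sketch of the equivalent project-level settings.py (assuming the same scrapys package layout as above):

# settings.py -- hypothetical project-wide equivalent of custom_settings
SPIDER_DATA = "this is spider data"

DOWNLOADER_MIDDLEWARES = {
    "scrapys.mymiddleware.MyMiddleware": 100,
}

ITEM_PIPELINES = {
    "scrapys.mypipeline.MyPipeline": 100,
}

SPIDER_MIDDLEWARES = {
    "scrapys.myspidermiddleware.MySpiderMiddleware": 100,
}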
2. Pipeline
mypipeline.py
class MyPipeline(object):

    def __init__(self, spider_data):
        self.spider_data = spider_data

    @classmethod
    def from_crawler(cls, crawler):
        """Read the spider's settings and return a Pipeline instance."""
        spider_data = crawler.settings.get("SPIDER_DATA")
        print("### pipeline get spider_data: {}".format(spider_data))
        return cls(spider_data)

    def process_item(self, item, spider):
        """
        return the item: continue processing
        raise DropItem: discard the item
        """
        print("### call process_item")
        return item

    def open_spider(self, spider):
        """Called when the spider is opened."""
        print("### spider open {}".format(spider.name))

    def close_spider(self, spider):
        """Called when the spider is closed."""
        print("### spider close {}".format(spider.name))
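As the process_item docstring notes, raising DropItem discards an item instead of passing it on. A minimal sketch of that pattern (FilterPipeline and the title field are hypothetical, not part of the example project):

from scrapy.exceptions import DropItem


class FilterPipeline(object):
    """Hypothetical pipeline that discards items without a title."""

    def process_item(self, item, spider):
        if not item.get("title"):
            # A dropped item is not passed to any later pipeline
            raise DropItem("missing title: {}".format(item))
        return item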
3. Downloader-Middleware
mymiddleware.py
class MyMiddleware(object):

    def __init__(self, spider_data):
        self.spider_data = spider_data

    @classmethod
    def from_crawler(cls, crawler):
        """Read the spider's settings and return a middleware instance."""
        spider_data = crawler.settings.get("SPIDER_DATA")
        print("### middleware get spider_data: {}".format(spider_data))
        return cls(spider_data)

    def process_request(self, request, spider):
        """
        return None: continue processing the Request
               Response: return that Response directly
               Request: reschedule the new Request
        raise IgnoreRequest: process_exception -> Request.errback
        """
        print("### call process_request")

    def process_response(self, request, response, spider):
        """
        return Response: continue processing the Response
               Request: reschedule the new Request
        raise IgnoreRequest: Request.errback
        """
        print("### call process_response")
        return response

    def process_exception(self, request, exception, spider):
        """
        return None: continue handling the exception
               Response: return that Response
               Request: reschedule the Request
        """
        pass
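Returning a Response from process_request short-circuits the download entirely. A hedged sketch of that pattern (the URL check and the stub body are made up for illustration):

from scrapy.http import HtmlResponse


class ShortCircuitMiddleware(object):
    """Hypothetical middleware that answers some requests from a stub."""

    def process_request(self, request, spider):
        if request.url.endswith("/blocked"):
            # The engine treats this as the downloaded response;
            # the remaining process_request methods are skipped
            return HtmlResponse(url=request.url,
                                body=b"<html>stub</html>",
                                encoding="utf-8")
        return None  # continue to the downloader as usual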
4. Spider-Middleware
myspidermiddleware.py
class MySpiderMiddleware(object):

    def __init__(self, spider_data):
        self.spider_data = spider_data

    @classmethod
    def from_crawler(cls, crawler):
        """Read the spider's settings and return a middleware instance."""
        spider_data = crawler.settings.get("SPIDER_DATA")
        print("### spider middleware get spider_data: {}".format(spider_data))
        return cls(spider_data)

    def process_spider_input(self, response, spider):
        """
        Called for each response passing through the middleware.
        return None: continue processing the response
        raise an Exception otherwise
        """
        print("### call process_spider_input")

    def process_spider_output(self, response, result, spider):
        """
        Called with the result returned by the spider for this response.
        return an iterable of Request, dict, or Item objects
        """
        print("### call process_spider_output")
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """
        return None, or an iterable of Response, dict, or Item objects
        """
        pass
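Because process_spider_output receives everything the spider yields, it is a natural place to filter results. A minimal sketch (the domain check is an assumption for illustration, not part of the example project):

class FilterOutputMiddleware(object):
    """Hypothetical spider middleware that drops off-site requests."""

    def process_spider_output(self, response, result, spider):
        for r in result:
            # Requests carry a .url attribute; items and dicts pass through
            if hasattr(r, "url") and "baidu.com" not in r.url:
                continue  # silently drop the off-site request
            yield r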
After running the spider, check the logs:
### middleware get spider_data: this is spider data
### spider middleware get spider_data: this is spider data
### pipeline get spider_data: this is spider data
### spider open baidu_spider
### call process_request
### call process_response
### call process_spider_input
### call process_spider_output
### spider close baidu_spider
From the log output, we can see that the overall flow matches Scrapy's data-flow diagram.
Middleware startup order: downloader middleware -> spider middleware -> pipeline, following the order of the from_crawler log lines above, all instantiated before the spider opens.
Handler call order: process_request -> process_response -> process_spider_input -> process_spider_output. (process_item never fires here because parse yields no items.)
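The numeric values in the middleware settings control this ordering: lower values sit closer to the engine, so process_request runs in ascending order and process_response in descending order. A hedged sketch, assuming a second hypothetical middleware (OtherMiddleware is not part of the example project):

DOWNLOADER_MIDDLEWARES = {
    "scrapys.mymiddleware.MyMiddleware": 100,        # closer to the engine
    "scrapys.othermiddleware.OtherMiddleware": 200,  # closer to the downloader
}
# process_request:  MyMiddleware -> OtherMiddleware (ascending order values)
# process_response: OtherMiddleware -> MyMiddleware (descending order values)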