LinkExtractor
from scrapy.linkextractors import LinkExtractor
Link
from scrapy.link import Link
Link 的四个属性
url(链接的绝对地址)、text(锚标签中的文本)、fragment(URL 中 # 之后的部分)、nofollow(锚标签的 rel 属性中是否包含 nofollow)
如果需要解析出链接文本,可以在创建 LinkExtractor 时传入 attrs 参数(注意:按照 Scrapy 文档,attrs 指定的是从标签的哪些属性中提取链接,默认值为 ('href',);Link.text 取自锚标签的文本内容,是否必须添加 'text' 有待确认):
# Build an extractor and collect the Link objects found in `response`.
# NOTE(review): per the Scrapy docs, `attrs` lists the tag attributes that are
# scanned for URLs (default ('href',)); Link.text appears to come from the
# anchor's text content, so 'text' may be unnecessary here — confirm.
link_extractor = LinkExtractor(attrs=('href', 'text'))
# `response` is the scrapy Response being parsed (e.g. inside Spider.parse).
links = link_extractor.extract_links(response)
使用示例
import scrapy
from scrapy import cmdline  # required by the __main__ guard below
from scrapy.linkextractors import LinkExtractor


class DemoSpider(scrapy.Spider):
    """Demo spider that prints the text and URL of each extracted link."""

    name = 'spider'
    start_urls = [
        "https://book.douban.com/"
    ]

    def parse(self, response):
        """Extract the links in ``response`` that match the allow pattern and print them.

        Args:
            response: The downloaded page handed to the spider by Scrapy.
        """
        # `allow` takes a regular expression; only URLs matching it are kept.
        # NOTE(review): the pattern targets tianyancha.com while start_urls
        # points at book.douban.com, so few or no links may match — confirm
        # which site this example is meant to crawl.
        link_extractor = LinkExtractor(allow="https://www.tianyancha.com/brand/b.*")
        links = link_extractor.extract_links(response)
        for link in links:
            print(link.text, link.url)


if __name__ == '__main__':
    # Run the spider through Scrapy's command-line entry point.
    cmdline.execute("scrapy crawl spider".split())