# -*-coding:utf-8-*-
"""Demonstrate Scrapy ``Selector`` queries (XPath, CSS and regex) on inline HTML.

Every section prints a banner followed by the result of one or two
equivalent queries, so the XPath and CSS spellings can be compared.
"""
from scrapy.selector import Selector

# Sample document. It is assembled from adjacent string literals purely for
# readability; every piece starts with the single space that separates it
# from the previous tag, so the resulting value is one space-separated line.
html = (
    " <html> <head> <base href='http://example.com/' />"
    " <title>Example website</title> </head> <body> <div id='images'>"
    " <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>"
    " <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>"
    " <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>"
    " <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>"
    " <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>"
    " </div> </body> </html> "
)

sel = Selector(text=html)

# Page title, first via XPath and then via the equivalent CSS selector.
print("================title===============")
title_by_xpath = sel.xpath("//title//text()").extract_first()
print(title_by_xpath)
title_by_css = sel.css("title::text").extract_first()
print(title_by_css)

# The href attribute of every <a> element.
print("================href===============")
hrefs = sel.xpath("//a/@href").extract()
print(hrefs)
hrefs_by_css = sel.css("a::attr(href)").extract()
print(hrefs_by_css)

# Only the hrefs whose value contains the substring "image".
print("================img===============")
imgs = sel.xpath("//a[contains(@href, 'image')]/@href").extract()
print(imgs)
imgs_by_css = sel.css("a[href*=image]::attr(href)").extract()
print(imgs_by_css)

# The src attribute of the <img> nested inside each matching link.
print("================src===============")
src = sel.xpath("//a[contains(@href, 'image')]/img/@src").extract()
print(src)
src_by_css = sel.css("a[href*=image] img::attr(src)").extract()
print(src_by_css)

# Run a regex over the link text and keep what follows the "Name:" prefix.
print("================ re ===============")
text_by_re = sel.css("a[href*=image]::text").re(r"Name:\s*(.*)")
print(text_by_re)

# Relative XPath: the leading "." scopes the second query to the <div>
# selected by the first one instead of restarting from the document root.
print("================ xpath ===============")
div = sel.xpath("//div")
print(div)
a = div.xpath(".//a").extract()  # every <a> element below the current node
print(a)

print("================ text ===============")
text = '<a href="#">Click here to go to the <strong>Next Page</strong></a>'
sel1 = Selector(text=text)

# Text nodes that are direct children of <a> only.
a = sel1.xpath("//a/text()").extract()
print(a)

# All text nodes below <a>, including the one inside <strong>.
a = sel1.xpath("//a//text()").extract()
print(a)

# string() returns the concatenated text content as a single string:
# first for the <a> element, then for the selector's own context node.
a = sel1.xpath("string(//a)").extract()
print(a)
a = sel1.xpath("string(.)").extract()
print(a)


# Shorthand helper (recommended) for repeated XPath extraction.
def xp(x):
    """Return the extracted results of XPath expression ``x`` run on ``sel``."""
    return sel.xpath(x).extract()


all_a = xp("//a/text()")
print(all_a)