三个文件代码如下:
spider.py
# -*- coding: utf-8 -*-
# author : pengshiyu
# date : 2018-04-19
import scrapy
from scrapy.selector import Selector

from tencent_position_item import TencentPositionItem


class TencentPositionSpider(scrapy.Spider):
    """Crawl Tencent HR job listings and yield one item per position.

    Follows the "next page" link until the last page is reached.
    """

    name = "tencent_position"
    allowed_domains = ["tencent.com"]
    # Enable only this spider's pipeline, independent of project settings.
    custom_settings = {
        "ITEM_PIPELINES": {
            "myspider.tencent_position_spider.tencent_position_pipeline.TencentPositionPipeline": 100,
        }
    }
    start_urls = [
        "https://hr.tencent.com/position.php",
    ]

    def parse(self, response):
        """Parse one listing page: yield a TencentPositionItem per row,
        then schedule a request for the next page if there is one."""
        base_url = "https://hr.tencent.com/"
        # Job rows alternate between the 'even' and 'odd' CSS classes.
        # XPath equivalent ('|' unions the two node sets):
        #   response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        rows = response.css(".even, .odd")
        for row in rows:
            position_name = row.xpath("./td[1]/a/text()").get()
            position_link = row.xpath("./td[1]/a/@href").get()
            position_type = row.xpath("./td[2]/text()").get()
            position_number = row.xpath("./td[3]/text()").get()
            work_location = row.xpath("./td[4]/text()").get()
            publish_time = row.xpath("./td[5]/text()").get()

            # Debug output of the extracted fields.
            print("*" * 30)
            print(position_name)
            print(position_link)
            print(position_type)
            print(position_number)
            print(work_location)
            print(publish_time)

            item = TencentPositionItem()
            item["position_name"] = position_name
            # Guard against a missing href: base_url + None would raise TypeError.
            item["position_link"] = base_url + position_link if position_link else None
            item["position_type"] = position_type
            item["position_number"] = position_number
            item["work_location"] = work_location
            item["publish_time"] = publish_time
            yield item

        # Pagination. A regex alternative (re_first may be re() in older
        # Scrapy versions):
        #   Selector(response).re_first(
        #       u'<a href="([^<]*)" id="next">下一页</a>', replace_entities=False)
        next_url = response.css("#next::attr(href)").extract_first()
        # On the last page the anchor's href is the dead link "javascript:;";
        # also guard against the anchor being absent entirely (None).
        if next_url and next_url != u"javascript:;":
            next_url = base_url + next_url
            print("下一页:", next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
        else:
            print("最后一页了", next_url)
item.py
# -*- coding:utf-8 -*- import scrapy class TencentPositionItem(scrapy.Item): position_name = scrapy.Field() # 职位名称 position_link = scrapy.Field() # 职位链接详情 position_type = scrapy.Field() # 职位类型 position_number = scrapy.Field() # 职位数量 work_location = scrapy.Field() # 工作地点 publish_time = scrapy.Field() # 发布时间
pipeline.py
# -*- coding: utf-8 -*-
import io
import json
import os

# Directory containing this file. (The original took os.path.abspath of
# __file__ itself, which is the file's path, not its directory.)
BASE_DIR = os.path.dirname(os.path.abspath(__file__))


class TencentPositionPipeline(object):
    """Write each scraped position as one JSON line to tencent_position.txt."""

    def __init__(self):
        self.f = None    # output file handle; opened lazily in open_spider
        self.count = 0   # number of items written so far

    def open_spider(self, spider):
        """Open the output file when the spider starts.

        Opening here (rather than in __init__) avoids leaking a file handle
        when the pipeline object is created but the spider never runs.
        UTF-8 is explicit because items contain Chinese text.
        """
        self.f = io.open("tencent_position.txt", "w", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* to one JSON line, count it, and pass it on.

        Returns the item unchanged so later pipelines still receive it.
        """
        # ensure_ascii=False keeps Chinese characters readable in the file.
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.f.write(content)
        self.count += 1
        return item

    def close_spider(self, spider):
        """Report the total item count and release the file handle."""
        print("爬取信息条数:{count}".format(count=self.count))
        if self.f is not None:
            self.f.close()