本文仅供学习交流使用,如侵立删!
二手车之家车辆档案数据爬虫
先上效果图
环境
- win10
- python3.9
- lxml、retrying、requests
需求分析
需求:主要是需要车辆详情页中车辆档案的数据。先抓包分析一波:网页抓包没有什么有用的,转战APP。
拿到数据接口就简单了,直接构造请求保存数据即可
获取车辆信息列表
def _get_car_list(self, _url: str):
    """Yield the absolute detail-page URL of every car on one listing page.

    The page at ``_url`` is fetched through ``self._parse_url``, parsed with
    lxml, and the ``href`` of every listing anchor is resolved against the
    site root.

    Args:
        _url: URL of a used-car listing page.

    Yields:
        str: Absolute URL of one car detail page.
    """
    # Imported locally so the method does not rely on the file's import block.
    from urllib.parse import urljoin

    res = self._parse_url(_url=_url)
    # Build an lxml tree from the response body so links can be picked by XPath.
    result = etree.HTML(res.text)
    url_list = result.xpath('//*[@id="goodStartSolrQuotePriceCore0"]/ul/li/a/@href')
    if not url_list:
        # No listing links on this page: report it and end the generator.
        print('获取完成!')
        return
    for href in url_list:
        # Some hrefs already carry the domain (presumably protocol-relative,
        # "//www.che168.com/..."), others are site-relative paths. urljoin
        # resolves both, and also leaves fully-qualified URLs intact instead
        # of blindly slicing off the first two characters.
        yield urljoin('https://www.che168.com', href)
获取车辆详情信息
def _get_car_info(self, _url: str):
    """Scrape the vehicle-archive fields of one car detail page and save them.

    The page at ``_url`` is fetched through ``self._parse_url`` and parsed
    with lxml. The title plus sixteen archive fields are extracted by XPath;
    any field whose XPath matches nothing is recorded as ``'-'``. The
    resulting single row (title, the sixteen fields in page order, then
    ``_url``) is printed and handed to ``self._save_csv``.

    Args:
        _url: URL of a car detail page.
    """
    res = self._parse_url(_url=_url)
    # Build an lxml tree from the response body so fields can be picked by XPath.
    result = etree.HTML(res.text)

    def first_text(xpath: str) -> str:
        """Return the stripped first match of *xpath*, or '-' when none."""
        nodes = result.xpath(xpath)
        return nodes[0].strip() if nodes else '-'

    # Title: when the heading has several text nodes the second one is used
    # (presumably the first is a badge or whitespace -- confirm against the
    # page). An empty match falls back to '-' like every other field instead
    # of raising IndexError.
    title_nodes = result.xpath('//div[@class="car-box"]/h3//text()')
    if len(title_nodes) > 1:
        title = title_nodes[1].strip()
    elif title_nodes:
        title = title_nodes[0].strip()
    else:
        title = '-'

    # Archive fields, listed in the column order of the saved row. Each entry
    # is the <ul>/<li> position below the archive container.
    archive_root = '//*[@id="nav1"]/div[1]'
    field_paths = (
        'ul[1]/li[1]',  # registration date
        'ul[1]/li[2]',  # displayed mileage
        'ul[1]/li[3]',  # gearbox
        'ul[1]/li[4]',  # emission standard
        'ul[1]/li[5]',  # engine displacement
        'ul[1]/li[6]',  # listing publish time
        'ul[2]/li[1]',  # annual inspection expiry
        'ul[2]/li[2]',  # insurance expiry
        'ul[2]/li[3]',  # warranty expiry
        'ul[2]/li[5]',  # number of ownership transfers (li[4] is not collected)
        'ul[2]/li[6]',  # location
        'ul[3]/li[1]',  # engine
        'ul[3]/li[2]',  # vehicle class
        'ul[3]/li[3]',  # body colour
        'ul[3]/li[4]',  # fuel grade
        'ul[3]/li[5]',  # drive mode
    )
    fields = [first_text(f'{archive_root}/{path}/text()') for path in field_paths]

    # One row wrapped in a list, as passed to _save_csv.
    data = [[title, *fields, _url]]
    print(data)
    self._save_csv(data=data)
资源下载
https://download.csdn.net/download/qq_38154948/85358088
本文仅供学习交流使用,如侵立删!