为了抓取和讯网高管增减持的数据,首先得分析一下数据的来源:
网址:
使用chrome开发者工具,可以发现在切换到第二页时,浏览器向下述地址发起了网络访问请求:
分析一下上述链接, count表示一页返回的结果数目,page代表页码数,callback表示回调函数的名称.
以下是发起上述URL对应的网络请求返回的数据:
很明显,这是一段javascript代码,不是json数据,无法使用python进行直接解析.为了加快项目进度,减少耦合,可以使用nodejs一步完成,不用将这个数据爬取分为抓取和解析两个步骤.
为了加快爬取速度,我们设置每发起一次请求,返回1000条数据,在给定页码范围的情况下,就可以生成由所有链接构成的数组:
/**
 * Build the list of request URLs for a range of loop indices.
 *
 * Each index `i` in [start, end] produces one URL by formatting the
 * template with `i + 1`.
 *
 * NOTE(review): the template string is empty in this copy of the file, so
 * `util.format` simply appends the number after a space. The `+ 1` offset
 * also means the pages requested are start+1 .. end+1 -- confirm that this
 * matches the server's page numbering.
 *
 * @param {number} start - First loop index (inclusive).
 * @param {number} end - Last loop index (inclusive).
 * @returns {string[]} One formatted URL per index; empty when start > end.
 */
function get_url_array(start, end) {
    const util = require("util");
    const template = "";
    const urls = [];
    let page = start;
    while (page <= end) {
        urls.push(util.format(template, page + 1));
        page++;
    }
    return urls;
}
对于给定链接,获取该链接的数据并将其转换为javascript对象,取出其中有价值的数据list,对应函数:
/**
 * Synchronously download one result page and return its `list` payload.
 *
 * The response body is decoded as GB2312 text and evaluated as JavaScript
 * (the server answers with script, not JSON). A user-agent is picked at
 * random from a fixed pool for every request.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {*} The `list` property of the value produced by evaluating the
 *   response.
 */
function get_data_from_url(url) {
    var request = require('sync-request');
    var iconv = require("iconv-lite");
    var user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56',
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 ",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 ",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 ",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 ",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 ",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 ",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 ",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 ",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 ",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 ",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 ",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
    ];
    // Both bounds are inclusive, so the upper bound is the last valid index.
    var pos = randomIntRange(0, user_agent_list.length - 1);
    // Send a browser-like user-agent with every request.
    var res = request('GET', url, {
        'headers': {
            'user-agent': user_agent_list[pos],
            'Host': 'stockdata.stock.hexun.com',
            'Referer': ''
        },
        retry: true,
        retryDelay: 10000,
        maxRetries: 5,
        timeout: 200000
    });
    var buf = res.getBody();
    // The response is encoded as GB2312.
    var data_str = iconv.decode(buf, 'gb2312');
    // One record ends its string with a stray backslash, which escapes the
    // closing quote and makes the whole script unparseable. Strip that
    // backslash (note the doubled `\\`: the pattern must match a literal
    // backslash, not escape the regex delimiter).
    data_str = data_str.replace(/上海市浦东新区公共交通投资发\\/g, "上海市浦东新区公共交通投资发");
    // SECURITY NOTE(review): eval executes whatever script the remote server
    // returns. Only acceptable while the endpoint is trusted; consider a
    // sandboxed evaluation or a real parser.
    var data_list = eval(data_str);
    return data_list.list;
}
这里面有一个小坑:在处理到大约第14个链接的时候,服务器返回的数据并不是正确的javascript脚本,其中有如下错误:
//wrong !
changePeopleTitle: '上海市浦东新区公共交通投资发\'
//right
changePeopleTitle: '上海市浦东新区公共交通投资发'
就因为多了一个转义符号,导致整个语句出现语法错误,不能正确利用eval函数进行转换.这很可能是数据录入时的笔误.所以需要对这个bug特殊处理,对应上述代码中调用replace的那一行.另外,这个抓取程序需要正确设置user-agent;为了防止被封,我设置了user-agent池,利用random函数随机选取user-agent.
/**
 * Return a random integer between `low` and `high`, both bounds inclusive.
 *
 * @param {number} low - Smallest value that can be returned.
 * @param {number} high - Largest value that can be returned.
 * @returns {number} A value drawn with Math.random().
 */
function randomIntRange (low, high) {
    // Number of distinct values in the inclusive range.
    const span = high - low + 1;
    const scaled = Math.random() * span;
    return Math.floor(scaled + low);
}
为了解析javascript语句,需要设置和请求对应的回调函数,如下:
/**
 * Callback invoked by the script the server returns.
 *
 * Passes its argument through `eval` and hands back the result. A value
 * that is not a string comes back from `eval` unchanged.
 *
 * NOTE(review): a string argument is executed as code -- only call this
 * with trusted input.
 *
 * @param {*} str - Value (or script text) supplied by the returned script.
 * @returns {*} Result of evaluating `str`.
 */
function hxbase_json5(str) {
    return eval(str);
}
解析javascript对象并将其存入数据库的操作定义在函数save_data_to_mysql() 中,其中利用了sequelize的orm模型来简化实现.
/**
 * Crawl every result page and store each record in MySQL through a
 * Sequelize model.
 *
 * The table is synced with `force: true` (Sequelize drops and recreates it)
 * before pages are fetched one after another with a 200 ms pause between
 * requests.
 *
 * @returns {Promise} Settles once the sync and all row saves have finished;
 *   rejects if the sync, a fetch, or any save fails.
 */
function save_data_to_mysql() {
    var Sequelize = require('sequelize');
    var sleep = require("sleep");

    // Map missing or whitespace-only values to null; pass others through.
    function blank_to_null(value) {
        if (value == null) {
            return null;
        }
        if (typeof value === 'string' && value.trim() === '') {
            return null;
        }
        return value;
    }

    // Parse a numeric field; blanks and unparseable text become null so that
    // NaN is never written to a DOUBLE column.
    function to_number(value) {
        var cleaned = blank_to_null(value);
        if (cleaned === null) {
            return null;
        }
        var parsed = parseFloat(cleaned);
        return isNaN(parsed) ? null : parsed;
    }

    // Convert one raw record into model attributes. Returns null when
    // stockName is not in the "name(code)" form.
    function to_row(record) {
        var name_parts = String(record.stockName).split("(");
        if (name_parts.length < 2) {
            return null;
        }
        return {
            'stock_code': name_parts[1].split(")")[0],
            'stock_name': name_parts[0],
            // Dates arrive with a two-digit year, so the century is prepended.
            'changeDate': '20' + record.changeDate,
            'noticeDate': '20' + record.noticeDate,
            // changeNum, price, duties and relation are wrapped in HTML markup.
            'changeNum': to_number(get_content_from_html(record.changeNum)),
            'averagePrice': to_number(record.averagePrice),
            'price': to_number(get_content_from_html(record.price)),
            'shareHoldingNum': to_number(record.shareHoldingNum),
            'changeRatio': to_number(record.changeRatio),
            'circulationCapitalRatio': to_number(record.circulationCapitalRatio),
            'changeWay': blank_to_null(record.changeWay),
            'changePeople': blank_to_null(record.changePeople),
            'changePeopleTitle': blank_to_null(record.changePeopleTitle),
            'duties': blank_to_null(get_content_from_html(record.duties)),
            'relation': blank_to_null(get_content_from_html(record.relation)),
            'industry': blank_to_null(record.industry)
        };
    }

    var sequelize = new Sequelize(
        'dbname',
        'root',
        'passwd',
        {
            'dialect': 'mysql',
            'host': '127.0.0.1',
            'port': 3306,
            define: {
                charset: 'utf8',
                // Do not add timestamp columns.
                timestamps: false
            }
        }
    );
    // Executive shareholding changes.
    var Ggzjc = sequelize.define(
        'table_name', {
            'stock_code': { // stock code
                'type': Sequelize.STRING,
                'allowNull': false,
                'unique': false
            },
            'stock_name': { // stock name
                'type': Sequelize.STRING,
                'allowNull': false,
                'unique': false
            },
            'changeDate': { // date of change
                'type': Sequelize.DATEONLY,
                'allowNull': true
            },
            'noticeDate': { // announcement date
                'type': Sequelize.DATEONLY,
                'allowNull': true
            },
            'changeNum': { // number of shares changed (10k shares)
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'averagePrice': { // average price
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'price': { // amount
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'shareHoldingNum': { // shares held after the change
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'changeRatio': { // change ratio for the person
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'circulationCapitalRatio': { // share of circulating capital
                'type': Sequelize.DOUBLE,
                'allowNull': true
            },
            'changeWay': { // method of change
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'changePeople': { // person whose holding changed
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'changePeopleTitle': { // related director / executive
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'duties': { // position
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'relation': { // relationship
                'type': Sequelize.STRING,
                'allowNull': true
            },
            'industry': { // industry
                'type': Sequelize.STRING,
                'allowNull': true
            }
        }
    );
    return Ggzjc.sync({force: true}).then(function () {
        var pending = [];
        var url_array = get_url_array(1, 58);
        for (var i = 0; i < url_array.length; i++) {
            var data = get_data_from_url(url_array[i]);
            // Pause 200 ms between requests to go easy on the server.
            sleep.usleep(200000);
            console.log(i + 1);
            console.log('complete!');
            for (var j = 0; j < data.length; j++) {
                var row = to_row(data[j]);
                if (row === null) {
                    console.warn('skipping record without stock code: ' + data[j].stockName);
                    continue;
                }
                // Keep every save promise so failures surface to the caller.
                pending.push(Ggzjc.build(row).save());
            }
        }
        return Promise.all(pending);
    });
}
需要解析并获取html标签(形如 <tag>content</tag>)中的content,利用正则表达式取出>和<中间的文本就可以了.
/**
 * Extract the text that sits between the first `>` and the following `<`
 * of an HTML fragment.
 *
 * @param {string} str - HTML fragment such as `<td>text</td>`.
 * @returns {string|null} The enclosed text, or null when `str` is not a
 *   string or contains no `>...<` span with at least one character.
 */
function get_content_from_html(str) {
    if (typeof str !== 'string') {
        return null;
    }
    // Lazy match: stop at the first `<` after the opening `>`.
    var pattern = />[\s\S]+?</;
    var res = str.match(pattern);
    if (res == null) {
        return null;
    }
    var result = res[0];
    // Drop the delimiting `>` and `<`.
    return result.slice(1, result.length - 1);
}
done!
附注:
借助python execjs和pandas,我实现了以更加优美的姿势爬取上述内容,代码详见我的github:
我的github: