简易nodejs爬虫抓取博客园指定用户的文章及浏览量-阿里云开发者社区

简易nodejs爬虫抓取博客园指定用户的文章及浏览量

2016-11-30 1043

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介： 需要安装 nodejs 和 cheerio 模块。实现了自定义用户、自定义页数，抓取完毕自动停止，无重复。可以按需修改文章类和评论的类名。用法：首先 npm install cheerio，然后执行 node cnblog [username]，文件结果保存在 res/cnblog.txt。

需要安装nodejs和cheerio模块

实现了自定义用户、自定义页数，抓取完毕自动停止，无重复

可以按需修改文章类和评论的类名

用法：

首先 npm install cheerio

执行 node cnblog [username]

文件结果保存在res/cnblog.txt

//cnblog.js
var http = require('http')
var fs = require('fs')
var path = require('path')
var cheerio = require('cheerio')


// Shared crawler state, read and updated by grab() and saveFile().
var str = '';   // accumulated output text
var n = 1;      // page number to request next
var byte = 0;   // socket byte count seen on the previous response (duplicate-page check)

// Crawl configuration handed to grab().
var crawlOptions = {
	// cnblogs user name; taken from the first CLI argument, falling back to 'txxt'
	user: process.argv[2] || 'txxt',
	// maximum number of listing pages to fetch
	pages: 10,
	// invoked once crawling stops: persist everything collected so far
	cb: function onFinished() {
		saveFile(str);
	},
	// selector for each post container on the listing page
	postClass: '.day',
	// selector for the post footer from which the "(number)" count is read
	commentClass: '.postDesc'
};

grab(crawlOptions);


/*数据获取*/
/**
 * Fetch listing page `n` of a cnblogs user, append the extracted post titles
 * and counts to the module-level `str`, then move on to the next page or
 * finish by invoking `opt.cb`.
 *
 * Reads and updates the module-level `n`, `byte` and `str`.
 *
 * @param {Object} opt
 * @param {string} opt.user          cnblogs user name used to build the URL
 * @param {number} opt.pages         highest page number to request
 * @param {Function} [opt.cb]        called once when crawling stops
 * @param {string} opt.postClass     selector for each post container
 * @param {string} opt.commentClass  selector for the footer holding "(number)"
 */
function grab(opt) {
	var prefix = 'http://www.cnblogs.com/' + opt.user + '/default.html?page=';
	// The current URL is stored on the options object, as the original did.
	opt.url = prefix + n;

	http.get(opt.url, function(res) {
		// Duplicate-page heuristic: if the socket's byte count equals the one
		// recorded for the previous response, treat this page as a repeat and
		// stop crawling.
		// NOTE(review): this relies on bytesRead at callback time differing
		// between distinct pages — confirm it is reliable enough.
		var go = res.socket.bytesRead !== byte;
		if (go) {
			byte = res.socket.bytesRead;
		}

		// Let the stream decode UTF-8 itself, so a multi-byte character that
		// is split across two chunks is not corrupted by per-chunk conversion.
		res.setEncoding('utf8');

		var html = '';
		res.on('data', function(chunk) {
			html += chunk;
		});

		res.on('end', function() {
			if (go) {
				str += '\r\n' + '第' + n + '页开始' + '\r\n';
				filter(html);
				str += '\r\n' + '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>' + '\r\n';
				console.log('第' + n + '页抓取完毕');
			}

			// Advance only after the page above has been logged with its own number.
			n = n + 1;
			if (n <= opt.pages && go) {
				grab(opt);
			} else {
				opt.cb && opt.cb();
			}
		});
	}).on('error', function(err) {
		console.log('获取数据出错', err && err.message);
	});

	/**
	 * Append the title and the first "(number)" found in the footer of every
	 * post container in `html` to the module-level `str`.
	 */
	function filter(html) {
		var $ = cheerio.load(html);
		var countPattern = /\((\d+)\)/;

		$(opt.postClass).each(function() {
			var entry = $(this);
			var title = entry.find('.postTitle a').text();
			str += '\r\n' + title;

			var foot = entry.find(opt.commentClass).text();
			// exec() returns null when the footer has no "(digits)"; fall back
			// to an empty count instead of throwing and aborting the crawl.
			var match = countPattern.exec(foot);
			var comment = match ? match[1] : '';
			str += '浏览量>>>>' + comment;
		});
	}
}

/**
 * Write `content` to res/cnblog.txt next to this script, then print it to the
 * console. The content is printed even when writing fails, so the crawl
 * result is never lost.
 *
 * @param {string} content  text to persist and print
 */
function saveFile(content) {
	// Declared locally; the original leaked `fpath` as an implicit global.
	var fpath = path.join(__dirname, './res/cnblog.txt');
	var dir = path.dirname(fpath);

	// writeFile does not create missing directories, so make sure res/ exists.
	fs.mkdir(dir, function(mkdirErr) {
		// EEXIST only means the directory is already there.
		if (mkdirErr && mkdirErr.code !== 'EEXIST') {
			console.log('写入失败', mkdirErr.message);
			console.log(content);
			return;
		}

		fs.writeFile(fpath, content, function(err) {
			if (err) {
				console.log('写入失败', err.message);
			}
			// Print the parameter rather than the global `str`; the only
			// visible caller passes `str`, so the output is the same.
			console.log(content);
		});
	});
}

简易nodejs爬虫抓取博客园指定用户的文章及浏览量

热门文章

最新文章

相关课程

相关电子书

相关实验场景

热门

活动广场

任务中心

开发者评测

高校计划

乘风者计划

训练营

阿里云MVP

话题

直播

下载

镜像站

技术资料

插件

简易nodejs爬虫抓取博客园指定用户的文章及浏览量

热门文章

最新文章

相关课程

相关电子书

相关实验场景