代码查看下载
https://github.com/chrunlee/citys
github 上可以直接下载查看
代码来源
最初的代码不是我写的,而是来自另一个 Node.js 版本的抓取项目,不过它的数据存储用的是 SQLite。如果想看原项目,可以点击这里:https://github.com/modood/Administrative-divisions-of-China
用到的nodejs模块
iconv-lite
bufferhelper
sqlquery-tool
http(Node.js 内置模块,无需安装)
不想去github上看的,可以直接看代码
入口函数 fetch.js
const sql = require('./sql')
const worker = require('./worker')
/**
 * Entry point of the crawl.
 *
 * Runs the three stages strictly one after another: recreate the database
 * tables, crawl everything down to village level, then insert the manual
 * patch rows. Prints a completion message once all stages have finished.
 *
 * @returns {Promise<void>} Resolves when every stage has completed; rejects
 *   with the first error raised by any stage.
 */
async function main() {
  const stages = [
    () => sql.init(),
    () => worker.fetchVillages(),
    () => worker.patch(),
  ];
  for (const runStage of stages) {
    // Each stage must finish before the next one starts.
    await runStage();
  }
  console.log('[100%] 数据抓取完成!');
}
// Kick off the crawl. Exit with status 0 on success; on any failure print
// the error and exit with status -1.
main()
  .then(() => {
    process.exit(0);
  })
  .catch((err) => {
    console.log(err);
    process.exit(-1);
  });
数据库连接及初始化 sql.js
var query = require('sqlquery-tool');
// MySQL connection settings.
// Every value can be overridden with an environment variable (MYSQL_HOST,
// MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DATABASE) so that real
// credentials do not have to be written into this file. When a variable is
// not set, the original local-development value is used. The check is
// against `undefined` so that an intentionally empty value (for example an
// empty password) is respected.
query.query({
  host: process.env.MYSQL_HOST !== undefined ? process.env.MYSQL_HOST : '127.0.0.1',
  port: process.env.MYSQL_PORT !== undefined ? process.env.MYSQL_PORT : '3306',
  user: process.env.MYSQL_USER !== undefined ? process.env.MYSQL_USER : 'root',
  password: process.env.MYSQL_PASSWORD !== undefined ? process.env.MYSQL_PASSWORD : 'root',
  database: process.env.MYSQL_DATABASE !== undefined ? process.env.MYSQL_DATABASE : 'citys'
});
// Statements that remove each table if it already exists, so that a run
// always starts from an empty schema.
const drop1 = 'drop table if exists province;';
const drop2 = 'drop table if exists city;';
const drop3 = 'drop table if exists area;';
const drop4 = 'drop table if exists street;';
const drop5 = 'drop table if exists village;';
// Table definitions, one per administrative level. Every table stores the
// division's own `code` and `name`; each level below province additionally
// stores the codes of all of its ancestors (pcode = province, ccode = city,
// acode = area, scode = street).

// Province level.
const province = `
create table province (
code varchar(50) ,
name varchar(100)
)
`;

// City level.
const city = `
create table city (
code varchar(50),
name varchar(100),
pcode varchar(50)
)
`;

// Area (county) level.
const area = `
create table area (
code varchar(50),
name varchar(100),
ccode varchar(50),
pcode varchar(50)
)
`;

// Street (township) level.
const street = `
create table street (
code varchar(50),
name varchar(100),
acode varchar(50),
ccode varchar(50),
pcode varchar(50)
)
`;

// Village level.
const village = `
create table village (
code varchar(50),
name varchar(100),
scode varchar(50),
acode varchar(50),
ccode varchar(50),
pcode varchar(50)
)
`;
/**
 * Rebuild the database schema from scratch.
 *
 * Executes the five `drop table if exists` statements and then the five
 * `create table` statements, strictly in that order, waiting for each
 * statement to finish before sending the next one.
 *
 * @returns {Promise<void>} Resolves once the last statement has completed;
 *   rejects with the error of the first statement that fails.
 */
module.exports.init = async () => {
  const statements = [
    drop1, drop2, drop3, drop4, drop5,
    province, city, area, street, village,
  ];
  for (const statement of statements) {
    await query.query({ sql: statement, params: [] });
  }
};
数据插入保存 worker.js
const crawler = require('./crawler')
console.log('enter data fetch')
var sqlquery = require('sqlquery-tool');
/**
* 抓取所有省级数据
* @author https://github.com/modood
* @datetime 2018-01-31 22:11
*/
exports.fetchProvinces = async () => {
console.log('[1/1]正在抓取省级数据...')
const o = await crawler.fetchProvinces()
const rows = []
for (const code in o) {
const name = o[code]
rows.push({ code, name })
}
await sqlquery.search('province').insert(rows);
}
/**
* 抓取所有地级数据
* @author https://github.com/modood
* @datetime 2018-01-31 22:13
*/
exports.fetchCities = async () => {
await exports.fetchProvinces()
const list = await sqlquery.search('province').list();
const count = list.length;
const rows = [];
for(var i=0;i<list.length;i++){
var p = list[i];
console.log(`[${i+1}/${count}]正在抓取地级数据,当前省级:${p.code} ${p.name}`)
const o = await crawler.fetchCities(p.code)
for (const code in o) {
const name = o[code]
rows.push({ code, name, pcode : p.code })
}
}
await sqlquery.search('city').insert(rows);
}
/**
* 获取所有县级数据
* @author https://github.com/modood
* @datetime 2018-02-01 09:12
*/
exports.fetchAreas = async () => {
await exports.fetchCities()
const list = await sqlquery.search('city').list();
const count = list.length;
for(let i=0;i<count;i++){
const rows = [];
var data = list[i];
console.log(`[${i+1}/${count}]正在抓取县级数据,当前地级:${data.code} ${data.name}`)
if (['4420', '4419', '4604'].includes(data.code)) continue
const o = await crawler.fetchAreas(data.code)
for (const code in o) {
const name = o[code]
rows.push({ code, name, ccode : data.code, pcode : data.pcode })
}
if(rows.length > 0){
await sqlquery.search('area').insert(rows);
}
}
// 特殊处理:广东省中山市(3320)、广东省东莞市(4419)、海南省儋州市(4604)没有县级,
// 需要手动插入。
var area2 = [
{ code: '441900', name: '东莞市', ccode: '4419', pcode: '44' },
{ code: '442000', name: '中山市', ccode: '4420', pcode: '44' },
{ code: '460400', name: '儋州市', ccode: '4604', pcode: '46' }
];
await sqlquery.search('area').insert(area2);
await sqlquery.search('area').where({code : '620201'}).update({name : '嘉峪关市'});
}
/**
* 获取所有乡级数据
* @author https://github.com/modood
* @datetime 2018-02-01 09:28
*/
exports.fetchStreets = async () => {
await exports.fetchAreas();
const list = await sqlquery.search('area').list();
const count = list.length;
for(let i=0;i<count;i++){
const rows = [];
var data = list[i],areaName = data.name,areaCode = data.code,cityCode = data.ccode,provinceCode = data.pcode;
console.log(`[${i+1}/${count}]正在抓取乡级数据,当前县级:${areaCode} ${areaName}`)
if ((areaName === '市辖区' && !['620201', '460201'].includes(areaCode)) || ['350527'].includes(areaCode)) continue
let route
if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cityCode}`
const o = await crawler.fetchStreets(areaCode, route)
for (const code in o) {
const name = o[code]
rows.push({ code, name, acode : areaCode, ccode : cityCode, pcode : provinceCode })
}
if(rows.length > 0){
await sqlquery.search('street').insert(rows);
}
}
}
/**
* 抓取所有村级数据
* @author https://github.com/modood
* @datetime 2018-02-01 09:47
*/
exports.fetchVillages = async () => {
await exports.fetchStreets()
const list = await sqlquery.search('street').list();
const count = list.length;
for(let i=0;i<count;i++){
const rows = [];
var data = list[i],streetCode = data.code,streetName = data.name,cityCode = data.ccode,provinceCode = data.pcode,areaCode = data.acode;
console.log(`[${i+1}/${count}]正在抓取村级数据,当前乡级:${streetCode} ${streetName}`)
let route
const cCodeSuffix = cityCode.substr(2, 2)
if (['4420', '4419', '4604'].includes(cityCode)) route = `${provinceCode}/${cCodeSuffix}/${streetCode}`
const o = await crawler.fetchVillages(streetCode, route)
for (const code in o) {
const name = o[code]
rows.push({ code, name, scode : streetCode, acode : areaCode, ccode : cityCode, pcode : provinceCode })
}
if(rows.length > 0){
await sqlquery.search('village').insert(rows);
}
}
}
/**
* 补漏
* @author https://github.com/modood
* @datetime 2018-02-02 13:39
*/
exports.patch = async () => {
// 特殊处理:福建省泉州市金门县(350527)没有乡级导致没有匹配上爬取县级的正则表达式。
// 手动插入县级、乡级、村级
const areas = [
{ code: '350527', name: '金门县', ccode: '3505', pcode: '35' }
]
const streets = [
{ code: '350527000', name: '金门县', acode: '350527', ccode: '3505', pcode: '35' }
]
const villages = [
{ code: '350527000000', name: '金门县', scode: '350527000', acode: '350527', ccode: '3505', pcode: '35' }
]
await sqlquery.search('area').insert(areas);
await sqlquery.search('street').insert(streets);
await sqlquery.search('village').insert(villages);
}
数据抓取 crawler.js
const http = require('http')
const iconv = require('iconv-lite')
const BufferHelper = require('bufferhelper')
/*
* Naming abbreviations used in this file
*
* Province level (Province) p
* City level (City) c
* Area level (county / district, Area) a
* Street level (township / sub-district, Street) s
* Village level (village / residents' committee, Village) v
*/
// Province index page: group 1 is the link target without ".html", group 2
// is the link text before the <br/>.
// NOTE(review): the "." before "html" is unescaped, so it matches any character.
const pReg = /<td><a href='(.*?).html'>(.*?)<br\/><\/a><\/td>/g
// City / area / street pages: a table row with two linked cells; group 1 is
// the text of the first link, group 2 the text of the second link.
const casReg = /<tr class='.*?'><td><a href=.*?>(.*?)<\/a><\/td><td><a href=.*?>(.*?)<\/a><\/td><\/tr>/g
// Village pages: three plain cells; group 1 is the first cell, group 2 the
// third cell (the middle cell is matched but not captured).
const vReg = /<td>(.*?)<\/td><td>.*?<\/td><td>(.*?)<\/td>/g
// Host name every request is sent to.
const host = 'www.stats.gov.cn'
// Request path template; exports.fetch substitutes '#{route}' with the page route.
const path = '/tjsj/tjbz/tjyqhdmhcxhfdm/2018/#{route}.html'
/**
 * Download one page and extract code -> name pairs from it.
 *
 * The page is requested over HTTP from `host`, at the module-level path
 * template with '#{route}' replaced by `route`. The body is decoded as GBK
 * and scanned with `regexp`; for every match, capture group 1 (truncated to
 * `codeLen` characters) becomes a key and the trimmed capture group 2
 * becomes its value.
 *
 * @author modood <https://github.com/modood>
 * @datetime 2018-01-31 19:23
 * @param {string} host - Host name to request.
 * @param {string} route - Page route inserted into the path template.
 * @param {RegExp} regexp - Global regular expression with two capture groups.
 * @param {number} codeLen - Number of leading characters of group 1 to keep.
 * @returns {Promise<Object<string, string>>} Resolves with the extracted
 *   mapping. Rejects with Error('timeout') when the request times out, with
 *   Error('Request Failed. Status Code: N') on a non-200 response, or with
 *   the underlying error on a network, stream or decoding failure.
 */
exports.fetch = (host, route, regexp, codeLen) =>
  new Promise((resolve, reject) => {
    const req = http.get({
      host,
      path: path.replace('#{route}', route),
      timeout: 3000
    }, res => {
      const statusCode = res.statusCode
      if (statusCode !== 200) {
        // Drain the response so the socket is released.
        res.resume()
        return reject(new Error('Request Failed. Status Code: ' + statusCode))
      }
      const bufferHelper = new BufferHelper()
      // A failure while the body is streaming must reject instead of going unhandled.
      res.on('error', reject)
      res.on('data', chunk => bufferHelper.concat(chunk))
      res.on('end', () => {
        try {
          const rawData = iconv.decode(bufferHelper.toBuffer(), 'GBK')
          const result = {}
          // The regular expressions are shared and global (stateful); always
          // start scanning from the beginning of the page.
          regexp.lastIndex = 0
          let current
          while ((current = regexp.exec(rawData)) !== null) {
            result[current[1].substring(0, codeLen)] = current[2].trim()
          }
          resolve(result)
        } catch (err) {
          // Decoding or parsing failures surface as a rejection, not as an
          // uncaught exception inside the event handler.
          reject(err)
        }
      })
    })
    req.on('error', reject)
    req.on('timeout', () => {
      // Reject first so callers always see the 'timeout' message they test for.
      reject(new Error('timeout'))
      // The 'timeout' event only signals idleness; the request has to be
      // destroyed explicitly, otherwise its socket stays open.
      req.destroy()
    })
  })
/**
* 抓取省级数据
* @author modood <https://github.com/modood>
* @datetime 2018-01-31 19:40
*/
exports.fetchProvinces = async () => {
try {
return await exports.fetch(host, 'index', pReg, 2)
} catch (err) {
if (err.message !== 'timeout') console.log(`抓取省级数据失败(${err}),正在重试...`)
return exports.fetchProvinces()
}
}
/**
* 抓取地级数据
* @author modood <https://github.com/modood>
* @datetime 2018-01-31 19:51
*/
exports.fetchCities = async (pCode) => {
try {
return await exports.fetch(host, pCode, casReg, 4)
} catch (err) {
if (err.message !== 'timeout') console.log(`抓取省级(${pCode})的地级数据失败(${err}),正在重试...`)
return exports.fetchCities(pCode)
}
}
/**
* 抓取县级数据
* @author modood <https://github.com/modood>
* @datetime 2018-01-31 20:03
*/
exports.fetchAreas = async (cCode) => {
cCode = cCode.toString()
const pCode = cCode.substr(0, 2)
try {
return await exports.fetch(host, `${pCode}/${cCode}`, casReg, 6)
} catch (err) {
if (err.message !== 'timeout') console.log(`抓取地级(${cCode})的县级数据失败(${err}),正在重试...`)
return exports.fetchAreas(cCode)
}
}
/**
* 抓取乡级数据
* @author modood <https://github.com/modood>
* @datetime 2018-01-31 20:08
*/
exports.fetchStreets = async (aCode, route) => {
aCode = aCode.toString()
const pCode = aCode.substr(0, 2)
const cCodeSuffix = aCode.substr(2, 2)
const _route = route || `${pCode}/${cCodeSuffix}/${aCode}`
try {
return await exports.fetch(host, _route, casReg, 9)
} catch (err) {
if (err.message !== 'timeout') console.log(`抓取县级(${aCode})的乡级数据失败(${err}),正在重试...`)
return exports.fetchStreets(aCode, route)
}
}
/**
* 抓取村级数据
* @author modood <https://github.com/modood>
* @datetime 2018-01-31 20:19
*/
exports.fetchVillages = async (sCode, route) => {
sCode = sCode.toString()
const pCode = sCode.substr(0, 2)
const cCodeSuffix = sCode.substr(2, 2)
const aCodeSuffix = sCode.substr(4, 2)
const _route = route || `${pCode}/${cCodeSuffix}/${aCodeSuffix}/${sCode}`
try {
return await exports.fetch(host, _route, vReg, 12)
} catch (err) {
if (err.message !== 'timeout') console.log(`抓取乡级(${sCode})的村级数据失败(${err}),正在重试...`)
return exports.fetchVillages(sCode, route)
}
}
关于数据库表结构部分,各位可以自行修改;修改后请注意同步调整 worker.js 中插入数据时使用的对应字段名(如 code、name、pcode 等)。