1. Node.js 安装配置:
http://www.runoob.com/nodejs/nodejs-install-setup.html
2. 安装需要的相关包
执行命令: npm i cheerio async colors
什么是cheerio、async、colors?
- cheerio 是专为服务端(Node.js)定制的、快速灵活的 jQuery 核心实现。它可以用来快速解析抓取到的页面,工作于一致的 DOM 模型之上,解析、操作、渲染都很高效。
https://www.npmjs.com/package/cheerio
- async 是一个实用工具模块,它为处理异步 JavaScript 提供了直观而强大的功能。虽然最初是为 Node.js 设计的(可通过 npm install --save async 安装),但也可以直接在浏览器中使用。
https://www.npmjs.com/package/async
- colors 是一个可以让你在 Node 控制台输出丰富样式与颜色的包
https://www.npmjs.com/package/colors
3.开始编码啦
新建 xxx.js 文件,内容如下:
// 引入需要的包
var
http = null,
fs = require('fs'),
cheerio = require('cheerio'),
parse = require('url').parse,
async = require('async')
require('colors')
// Page to crawl: taken from the first CLI argument; when none is given,
// http://jeffjade.com/2016/03/30/104-front-end-tutorial/ is crawled instead.
var TARGET_PATH = process.argv[2] || 'http://jeffjade.com/2016/03/30/104-front-end-tutorial/'
// Per-request timeout handed to setTimeout, in milliseconds (45 seconds).
var TIMEOUT_VALUE = 45 * 1000
// Entry point: start crawling the target page.
mian(TARGET_PATH)
// Entry point: downloads `targetUrl`, collects every unique http(s) link found
// inside its <body>, and passes the list to filterInvalidLinks().
//
// targetUrl: absolute URL of the page to crawl.
function mian (targetUrl) {
  var info = parse(targetUrl)
  // download() reads the file-level `http` variable, so pick the client module
  // that matches the protocol of the page being crawled.
  http = info.protocol.indexOf('https') > -1 ? require('https') : require('http')
  console.log('>> Start crawling all links ...'.green)
  // The callback receives the downloaded page as a string.
  download(targetUrl, function (data) {
    if (!data) return
    console.log('Well done! Grab all the links work has been completed!'.green)
    var $ = cheerio.load(data)
    // A Set removes duplicates and keeps first-seen order.
    var uniqueLinks = new Set()
    $('body a').each(function (i, e) {
      var absolute = resolveLink($(e).attr('href'), targetUrl)
      if (absolute) uniqueLinks.add(absolute)
    })
    console.log('>> Start handle these links(Eg: Duplicate removal,Make the path complete) ...'.cyan)
    console.log('>> Start analyzing the effectiveness of all links ...'.green)
    filterInvalidLinks(Array.from(uniqueLinks))
  })
}

// Turns an href taken from the page into an absolute http(s) URL.
// Returns '' for links that cannot be checked over HTTP: missing/empty hrefs,
// pure in-page anchors ("#top"), malformed values and non-http(s) schemes
// such as "javascript:" or "mailto:".
function resolveLink (href, baseUrl) {
  var trimmed = (href || '').trim()
  if (!trimmed || trimmed.charAt(0) === '#') return ''
  var resolved
  try {
    // Resolving against the page URL handles absolute, root-relative,
    // document-relative and protocol-relative ("//host/x") links alike.
    resolved = new URL(trimmed, baseUrl)
  } catch (e) {
    // Malformed href: nothing can be requested, so skip it.
    return ''
  }
  if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return ''
  // The fragment is never sent to the server; dropping it also lets
  // "page#a" and "page#b" collapse into a single request.
  resolved.hash = ''
  return resolved.href
}
// Downloads the page at `url` with the file-level `http` client and passes the
// whole body, decoded as a UTF-8 string, to `callback`.
// On a request error the error is logged and `callback` is not invoked.
function download (url, callback) {
  http.get(url, function (res) {
    var chunks = []
    res.on('data', function (chunk) {
      // Keep raw bytes: decoding chunk by chunk would corrupt any multi-byte
      // character that happens to be split across two chunks.
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk))
    })
    res.on('end', function () {
      callback(Buffer.concat(chunks).toString('utf8'))
    })
  }).on('error', function (err) {
    console.log('Opps, Download Error Occurred !'.red)
    console.log(err)
  })
}
// Checks every link in `needFilterList` concurrently and hands the collected
// per-link results to callback(): 'normal-link' for a link that answered,
// the link's URL for one that failed.
function filterInvalidLinks (needFilterList) {
  async.map(needFilterList, function (item, _callback) {
    requestUrl(item, function (err, result) {
      // async.map finishes as soon as one iteratee reports an error, which
      // would discard every link still in flight. Record a failure as a
      // result value (the offending URL) so all links get checked.
      _callback(null, err ? item : result)
    })
  }, function (err, results) {
    callback(err, results)
  })
}
// 以GET方法请求URL成功后调用CallBack
function requestUrl (url, callback) {
var info = parse(url),
path = info.pathname + (info.search || ''),
options = {
host: info.hostname,
port: info.port || 80,
path: path,
method: 'GET'
},
req = null,
request_timeout = null
request_timeout = setTimeout(function () {
request_timeout = null
req.abort()
callback(new Error('Request timeout'), url)
}, TIMEOUT_VALUE)
req = http.request(options, function (res) {
clearTimeout(request_timeout)
var chunks = [],
length = 0
res.on('data', function (chunk) {
length += chunk.length
chunks.push(chunk)
}).on('end', function () {
var data = new Buffer(length)
for (var i = 0, pos = 0, l = chunks.length; i < l; i++) {
chunks[i].copy(data, pos)
pos += chunks[i].length
}
res.body = data
callback(null, 'normal-link')
}).on('error', function (err) {
callback(err, url)
})
}).on('error', function (err) {
// node0.5.x及以上,调用req.abort()会触发一次“socket hang up” error;
// 所以需要判断是否超时,如果是超时,则无需再回调异常结果
if (request_timeout) {
clearTimeout(request_timeout)
callback(err, url)
}
})
req.end()
}
// Final reporter for the link check.
//   err:       unused here; failures are reported through `errUrlArr`.
//   errUrlArr: one entry per checked link, either 'normal-link' for a link
//              that answered or the URL of a link that failed.
// Prints the failed URLs in red and writes them out via outPrint(); when no
// link failed it only prints a success message.
function callback (err, errUrlArr) {
  // Keep real URLs only: drop the 'normal-link' markers and any empty slots.
  var invalidUrlList = (errUrlArr || []).filter(function (item) {
    return Boolean(item) && item !== 'normal-link'
  })
  // Decide on the filtered list: a non-empty result array that contains only
  // 'normal-link' entries still means every link is fine.
  if (invalidUrlList.length <= 0) {
    console.log('Nice, All links in the page are accessible. '.green)
    return
  }
  console.log('These are invalid links (Maybe for you to shield):'.yellow)
  invalidUrlList.forEach(function (item) {
    console.log(item.red)
  })
  outPrint(invalidUrlList)
}
// Serialises `resData` as 2-space-indented JSON and writes it to
// ./err_url_list.json; a write failure is rethrown from the write callback.
function outPrint (resData) {
  const serialized = JSON.stringify(resData, null, 2)
  fs.writeFile('./err_url_list.json', serialized, (writeErr) => {
    if (writeErr) {
      throw writeErr
    }
  })
}
4.使用
node xxx.js http://jeffjade.com/
源码地址:https://github.com/nicejade/nice-jade-collecting/blob/master/scripts/crawl_page-invalid_link.js