爬取整个网页
# 新建爬虫文件并编辑
[root@bogon http]# vi crawler.js
# 文件内容
// Fetch an entire web page and print its raw HTML to stdout.
var http = require('http')
var url = 'http://zhangdanyang.com/'
http.get(url, function(res){
  // Decode chunks as UTF-8. Without this, `data` is a Buffer and the
  // implicit Buffer-to-string coercion in `html += data` can corrupt a
  // multi-byte character that is split across two chunks.
  res.setEncoding('utf8')
  var html = ''
  res.on('data', function(data){
    html += data
  })
  res.on('end', function(){
    console.log(html)
  })
}).on('error', function(){
  console.log('获取数据失败!')
})
# 运行结果是打印页面html代码
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>...</title>
</head>
<body>
...
</body>
</html>
爬取慕课网课程实例
http://www.imooc.com/learn/348
【代码】
var http = require('http')
var cheerio = require('cheerio')
var url = 'http://www.imooc.com/learn/348'
/**
 * Extract chapter/video structure from the course page HTML.
 *
 * Returned data structure:
 * [{
 *   chapterTitle: '',
 *   videos: [{ title: '', id: '' }]
 * }]
 *
 * @param {string} html - full HTML of the course page
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>}
 */
function filterChapters(html){
  var $ = cheerio.load(html)
  var chapters = $('.chapter')
  var courseData = []
  chapters.each(function(item){
    var chapter = $(this)
    // <strong> contains nested tags; keep only its direct text nodes
    // (nodeType === 3) so the chapter title has no child-element text.
    var chapterTitle = chapter.find('strong').contents().filter(function() { return this.nodeType === 3; }).text().trim();
    var videos = chapter.find('ul').children()
    var chapterData = {
      chapterTitle: chapterTitle,
      videos: []
    }
    videos.each(function(item){
      var video = $(this).find('a')
      var href = video.attr('href')
      // Skip <li> entries without a usable video link: calling
      // .split() on an undefined href would throw a TypeError.
      if (!href || href.indexOf('video/') === -1) {
        return
      }
      // The anchor's nested tags are flattened into one text blob;
      // split on newlines and reassemble the pieces we need.
      var temp = video.text().trim()
      var arr = temp.split('\n')
      var videoTitle = arr[0].trim()
      // Some entries have single-line text; guard so arr[1].trim()
      // is not called on undefined.
      if (arr.length > 1) {
        videoTitle += ' ' + arr[1].trim()
      }
      var id = href.split('video/')[1]
      chapterData.videos.push({
        title: videoTitle,
        id: id
      })
    })
    courseData.push(chapterData)
  })
  return courseData
}
/**
 * Print every chapter title followed by its video lines,
 * one console.log call per line, each suffixed with '\n'.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
 */
function printCourseInfo(courseData){
  for (var i = 0; i < courseData.length; i++) {
    var chapter = courseData[i]
    console.log(chapter.chapterTitle + '\n')
    for (var j = 0; j < chapter.videos.length; j++) {
      var video = chapter.videos[j]
      console.log('【' + video.id + '】' + video.title + '\n')
    }
  }
}
// Download the course page, then parse it with filterChapters and
// print the result with printCourseInfo.
http.get(url, function(res){
  // Decode chunks as UTF-8 so a multi-byte character split across two
  // chunks is not corrupted by implicit Buffer-to-string coercion.
  res.setEncoding('utf8')
  var html = ''
  res.on('data', function(data){
    html += data
  })
  res.on('end', function(){
    var courseData = filterChapters(html)
    printCourseInfo(courseData)
  })
}).on('error', function(){
  console.log('获取数据失败!')
})
【结果】
[root@bogon http]# node crawler
第1章 前言
【6687】1-1 Node.js基础-前言 (01:20)
……(省略掉中间内容)
【8837】5-12 Node.js:request方法 (17:56)