准备
使用的库:
- superagent（需要安装类型声明文件 @types/superagent）：npm install superagent @types/superagent -D
- cheerio（像 jquery 一样来解析 html）：npm install cheerio @types/cheerio -D
开始
使用superagent爬取html
// Fetch the raw HTML of the target page.
// Fix: the original only logged result.text and discarded it — return the
// text so callers can actually use the fetched HTML (the final version of
// this method below relies on that return value).
async getRawHtml(){
const result = await superagent.get(this.url)
console.log('result test',result.text)
return result.text
}
使用cheerio 对html内容进行解析
cheerio遵从jquery的语法
- eq:表示第几个元素
// Parse course name/count pairs out of the page HTML.
// Fixes: use .each() for side-effect iteration (the original used .map()
// and discarded its return value), pass an explicit radix to parseInt,
// and return the collected data instead of only logging it.
getCourseInfo(html:string){
const courseInfos:Course[] = []
const $ = cheerio.load(html)
$('.content').find('.course-item').each((index,element) =>{
const desc = $(element).find('.course-desc')
// first .course-desc node is the name; second looks like "label:count"
const name = desc.eq(0).text()
const count = parseInt(desc.eq(1).text().split(':')[1], 10)
courseInfos.push({
name,
count
})
})
console.log('result',courseInfos)
return courseInfos
}
将爬取的内容写进json文件
- 使用fs读写文件
- 对Object类型的key,value定义interface
// Shape of the persisted JSON file: crawl timestamp (ms epoch) -> courses
// scraped at that time.
interface Content{
[propName:number]:Course[]
}
// Merge this crawl's result into data/course.json, keyed by crawl timestamp.
// NOTE(review): the parameter name shadows its own type name `courseResult` —
// consider renaming the interface to PascalCase `CourseResult`.
generateJsonFile(courseResult:courseResult){
// write to json file
let fileContent:Content = {}
const filePath = path.resolve(__dirname,'../data/course.json')
if(fs.existsSync(filePath)){
// NOTE(review): JSON.parse output is trusted as Content without validation
fileContent = JSON.parse(fs.readFileSync(filePath,'utf-8'))
}
fileContent[courseResult.time] = courseResult.data
fs.writeFileSync(filePath,JSON.stringify(fileContent,null,2),'utf-8')
}
最后整理一下代码，将读取文件与解析文件的行为分开。
使用组合模式优化代码
将独有的逻辑抽离
将分析html并生成文件内容的代码片段抽离出去，放在一个单独的类里面，并且调用。
- crowller 只负责读取/写
- analyzer 只负责分析
// class crowller
// Dependency injection: the analyzer is passed in, so Crowller stays
// decoupled from any concrete analysis logic.
// NOTE(review): `any` defeats type checking here — the refined version
// later in this file types it as the Analyzer interface instead.
constructor(private analyzer:any){
this.initSpiderProcess()
}
//index
// Composition root: build an analyzer and hand it to the crawler.
const analyzer = new Analyzer()
const crowller = new Crowller(analyzer)
将具体的analyzer的逻辑都放在analyze这个方法之中,并对analyzer这个类以及独有的analyze这个方法定义一个接口。
// crowller.ts
// Orchestrates one crawl cycle: fetch the page, run the injected
// analyzer over it, then persist whatever the analyzer produced.
async initSpiderProcess(){
const rawHtml = await this.getRawHtml()
const analyzed = this.analyzer.analyze(rawHtml, this.filePath)
this.writeFile(analyzed)
}
// Kick off the crawl immediately on construction.
// NOTE(review): the returned promise is discarded — a rejection from
// initSpiderProcess becomes an unhandled promise rejection; consider
// appending .catch() here.
constructor(private url:string,private analyzer:Analyzer){
this.initSpiderProcess()
}
//analyzer.ts
// Template method of the analyzer: extract course info from the html,
// merge it into the existing file content, and serialize for writing.
public analyze(html:string,filePath:string){
const info = this.getCourseInfo(html)
const merged = this.generateJsonFile(info, filePath)
return JSON.stringify(merged, null, 2)
}
//不同的分析器只要重写这一个方法就可以了
// Example of swapping in a different analyzer: a site-specific analyzer
// only needs to provide its own analyze() implementation.
export default class Web1Analyzer implements Analyzer{
public analyze(html:string,filePath:string){
// this demo analyzer passes the raw html straight through
return html
}
}
最后附上所有相关代码:
// analyzer.ts
import cheerio from 'cheerio'
import fs from 'fs'
import path from 'path'
import {Analyzer} from './crowller'
// A single scraped course entry.
interface Course{
name:string,
count:number
}
// One crawl's result: when it ran (ms epoch) and what it found.
// NOTE(review): interface names should be PascalCase — `CourseResult`.
interface courseResult{
time:number,
data:Course[]
}
// Persisted JSON shape: crawl timestamp -> courses scraped at that time.
interface Content{
[propName:number]:Course[]
}
// Concrete Analyzer: parses course data out of the demo page and merges it
// into previously persisted JSON content.
export default class Web1Analyzer implements Analyzer{
// Extract course name/count pairs from the html and stamp them with the
// crawl time.
private getCourseInfo(html:string){
const courseInfos:Course[] = []
const $ = cheerio.load(html)
// Fix: .each() is the cheerio call for side-effect iteration; the
// original used .map() and discarded its return value.
$('.content').find('.course-item').each((index,element) =>{
const desc = $(element).find('.course-desc')
// first .course-desc node is the name; second looks like "label:count"
const name = desc.eq(0).text()
const count = parseInt(desc.eq(1).text().split(':')[1], 10)
courseInfos.push({
name,
count
})
})
const result = {
time: Date.now(), // same value as new Date().getTime()
data: courseInfos
}
return result
}
// Merge this crawl's result into the existing file content (if any),
// keyed by crawl timestamp. Pure with respect to the filesystem except
// for the read — writing is the crawler's job.
private generateJsonFile(courseResult:courseResult,filePath:string){
let fileContent:Content = {}
if(fs.existsSync(filePath)){
// NOTE(review): JSON.parse output is trusted as Content without validation
fileContent = JSON.parse(fs.readFileSync(filePath,'utf-8'))
}
fileContent[courseResult.time] = courseResult.data
return fileContent
}
// Analyzer contract: html + target file path in, serialized file content out.
public analyze(html:string,filePath:string){
const courseInfo = this.getCourseInfo(html)
const fileContent = this.generateJsonFile(courseInfo,filePath)
return JSON.stringify(fileContent,null,2)
}
}
// crowller
import superagent from 'superagent'
import cheerio from 'cheerio'
import fs from 'fs'
import path from 'path'
import Web1Analyzer from './analyzer'
import analyzerB from './analyzerB'
// NOTE(review): Course / courseResult / Content duplicate the definitions in
// analyzer.ts and are unused in this file — consider a shared types module.
interface Course{
name:string,
count:number
}
interface courseResult{
time:number,
data:Course[]
}
interface Content{
[propName:number]:Course[]
}
// Contract every analyzer must satisfy: turn raw html (plus the target file
// path, so it can merge with previously persisted data) into the string
// content the crawler should write.
export interface Analyzer{
analyze:(html:string,filePath:string) => string;
}
// Crawler: only responsible for fetching html and reading/writing the data
// file; all analysis is delegated to the injected Analyzer.
class Crowller{
private filePath = path.resolve(__dirname,'../data/course.json')
// Start crawling as soon as the crawler is constructed.
// Fix: the async kickoff's rejection was previously unhandled — attach a
// .catch so failures are reported instead of becoming an unhandled
// promise rejection.
constructor(private url:string,private analyzer:Analyzer){
this.initSpiderProcess().catch((err) => {
console.error('spider process failed', err)
})
}
// Fetch the raw HTML of this.url.
async getRawHtml(): Promise<string> {
const result = await superagent.get(this.url)
return result.text
}
// Persist the analyzed content to the JSON data file.
writeFile(fileContent:string){
fs.writeFileSync(this.filePath,fileContent,'utf-8')
}
// One full crawl cycle: fetch -> analyze -> write.
async initSpiderProcess(){
const html = await this.getRawHtml()
const fileContent = this.analyzer.analyze(html,this.filePath)
this.writeFile(fileContent)
}
}
// NOTE(review): hard-coded secret in source — move to an env var/config
// for anything beyond this demo.
const secret = 'x3b174jsx'
const url = `http://www.dell-lee.com/typescript/demo.html?secret=${secret}`
// NOTE(review): class names should be PascalCase — `AnalyzerB`.
const analyzer = new analyzerB()
new Crowller(url,analyzer)
// superagent is a plain js library; ts compiles down to js
// .d.ts declaration ("translation") files supply types for js libs; @types packages provide them