目标
抓取微博热门列表的数据,并通过模拟页面滚动来触发分页加载;
工具
puppeteer cheerio
方案
先通过 puppeteer 模拟浏览器渲染网页,再用 cheerio 解析渲染后的 HTML,选取 DOM 节点并提取数据;
直接上代码
import cheerio from "cheerio";
import chalk from "chalk"; // 一个美化 console 输出的库
import fs from "fs";
import crypto from "crypto";
import puppeteer from "puppeteer";
const log = console.log; // Shorthand alias for console.log
/**
 * One scraped post record, as collected from the rendered page and written
 * to the JSON output. Every scalar field holds the raw text of a DOM node.
 */
type listType = {
  /** MD5 hex digest used to detect records that were already collected. */
  id: string;
  /** Text of the post's time element. */
  time: string;
  /** Text of the element next to the time (presumably the post's source). */
  from: string;
  /** Text of the post body. */
  description: string;
  /** `src` attributes of the pictures attached to the post. */
  imgList: string[];
  /** Text read from the post's toolbar counter (presumably reposts). */
  forward: string;
  /** Text read from the post's toolbar counter (presumably comments). */
  discuss: string;
  /** Text of the like counter. */
  fabulous: string;
};
/**
 * Crawler for the Weibo hot list.
 *
 * Constructing an instance starts the crawl immediately: the page is rendered
 * in headless Chromium via puppeteer, scrolled one viewport at a time, and the
 * rendered HTML is parsed with cheerio after every scroll. When the crawl
 * stops, the last HTML snapshot is written to `./src/index.html` and the
 * collected records to `./src/data.json`.
 */
class Reptile {
  // https://weibo.com/u/5587951849
  // https://weibo.com/newlogin?tabtype=list&gid=1028039999&url=https%3A%2F%2Fweibo.com%2F
  private url =
    "https://weibo.com/newlogin?tabtype=list&gid=1028039999&url=https%3A%2F%2Fweibo.com%2F";

  /** Viewport height in pixels; each round scrolls one more viewport down. */
  private static readonly VIEWPORT_HEIGHT = 900;
  /** Pause between two scroll rounds, in milliseconds. */
  private static readonly SCROLL_INTERVAL_MS = 200;
  /** The crawl stops once the record count has not changed across this many rounds. */
  private static readonly STALL_WINDOW = 50;
  /** The crawl stops once more than this many records were collected. */
  private static readonly MAX_ITEMS = 1000;

  /**
   * Runs the whole crawl: launch browser, load page, scroll and collect until
   * the stop condition is met, then persist the results.
   *
   * The returned promise settles only after the crawl has finished and the
   * browser has been closed; the browser is closed even when a step throws.
   */
  async getHtml(): Promise<void> {
    // Launch a browser environment through puppeteer.
    const browser = await puppeteer.launch({
      headless: true, // false opens a visible browser window
    });
    try {
      const page = await browser.newPage();
      // Set the rendering size (awaited so it is applied before navigation).
      await page.setViewport({
        width: 1200,
        height: Reptile.VIEWPORT_HEIGHT,
        deviceScaleFactor: 1,
      });
      // Disable the navigation timeout.
      page.setDefaultNavigationTimeout(0);
      await page.goto(this.url);
      // NOTE(review): reload() already resolves after a navigation; the extra
      // waitForNavigation() is kept from the original flow (presumably to wait
      // for a follow-up redirect). With the timeout disabled it waits
      // indefinitely if no further navigation happens — confirm on the live site.
      await page.reload();
      await page.waitForNavigation();
      log(chalk.yellow("页面初次加载完毕"));

      const startedAt = Date.now();
      const posts: listType[] = [];
      const seenIds = new Set<string>();
      const countHistory: number[] = [];
      let round = 0;
      let content = "";

      // Scroll page by page; a plain loop keeps every failure inside this
      // promise instead of in detached timer callbacks.
      for (;;) {
        log(chalk.blue(`第${round}次爬取,当前获取数据${posts.length}条`));
        round++;
        await page.evaluate((y: number) => {
          window.scrollTo(0, y);
        }, round * Reptile.VIEWPORT_HEIGHT);
        content = await page.content();
        this.collectPosts(content, posts, seenIds);
        countHistory.push(posts.length);
        if (this.shouldStop(countHistory, posts.length)) {
          break;
        }
        await new Promise<void>((resolve) =>
          setTimeout(resolve, Reptile.SCROLL_INTERVAL_MS)
        );
      }

      await this.saveResults(content, posts, (Date.now() - startedAt) / 1000);
    } finally {
      // Closing the browser also closes its pages.
      await browser.close();
    }
  }

  /**
   * Parses one HTML snapshot and appends every post not seen before.
   *
   * @param html    Rendered page HTML.
   * @param posts   Accumulator that new records are pushed onto.
   * @param seenIds Ids of records already in `posts`; updated in place.
   */
  private collectPosts(
    html: string,
    posts: listType[],
    seenIds: Set<string>
  ): void {
    const $ = cheerio.load(html);
    $(".vue-recycle-scroller__item-wrapper")
      .find(".vue-recycle-scroller__item-view")
      .each((_i, el) => {
        const item = $(el).find(".woo-panel-main");
        if (item.length === 0) {
          return; // view without a post panel: nothing to record
        }
        const time = item.find(".head-info_time_6sFQg").text();
        const from = item.find(".head-info_cut_1tPQI").text();
        const toolbarNums = item.find(".toolbar_num_JXZul");
        // The id hashes time + from + the text of all toolbar counters.
        const id = crypto
          .createHash("md5")
          .update(`${time}${from}${toolbarNums.text()}`)
          .digest("hex");
        if (seenIds.has(id)) {
          return;
        }
        seenIds.add(id);

        const imgList: string[] = [];
        item
          .find(".woo-box-wrap")
          .find(".woo-picture-img")
          .each((_j, img) => {
            const src = $(img).attr("src");
            if (src !== undefined) {
              imgList.push(src);
            }
          });

        posts.push({
          id,
          time,
          from,
          description: item.find(".detail_wbtext_4CRf9").text(),
          imgList,
          // Reading .text() on the whole match concatenates every counter, so
          // each field takes its own element. Assumes the first counter is the
          // repost count and the second the comment count — TODO confirm
          // against the live markup.
          forward: toolbarNums.eq(0).text(),
          discuss: toolbarNums.eq(1).text(),
          fabulous: item.find(".woo-like-count").text(),
        });
      });
  }

  /**
   * Decides whether crawling should end.
   *
   * @param history Record count after each round, oldest first.
   * @param total   Current number of collected records.
   * @returns `true` when the count stalled over the stall window or the
   *          record limit was exceeded.
   */
  private shouldStop(history: readonly number[], total: number): boolean {
    const stalled =
      history.length > Reptile.STALL_WINDOW &&
      history[history.length - 1] ===
        history[history.length - Reptile.STALL_WINDOW];
    return stalled || total > Reptile.MAX_ITEMS;
  }

  /**
   * Writes the last HTML snapshot and the collected records to `./src`.
   * Each write is best-effort: a failure is logged and success is reported
   * only for a write that actually succeeded.
   *
   * @param html           Last rendered page HTML.
   * @param posts          Collected records.
   * @param elapsedSeconds Crawl duration reported in the success message.
   */
  private async saveResults(
    html: string,
    posts: listType[],
    elapsedSeconds: number
  ): Promise<void> {
    try {
      await fs.promises.mkdir("./src", { recursive: true });
      await fs.promises.writeFile("./src/index.html", html, "utf8");
      log(chalk.green(`dom写入成功`));
    } catch (error: unknown) {
      console.log(error);
    }
    try {
      await fs.promises.mkdir("./src", { recursive: true });
      await fs.promises.writeFile(
        "./src/data.json",
        JSON.stringify(posts),
        "utf8"
      );
      log(
        chalk.green(
          `爬取数据${posts.length}条,共计用时${elapsedSeconds}s,写入成功`
        )
      );
    } catch (error: unknown) {
      console.log(error);
    }
  }

  /** Starts the crawl; a failure is logged instead of being left unhandled. */
  constructor() {
    void this.getHtml().catch((error: unknown) => {
      console.log(error);
    });
  }
}
// Start the crawl: the Reptile constructor calls getHtml() itself.
new Reptile();
创建一个 ts 文件(如 index.ts),把上面的代码放进去,安装好 puppeteer、cheerio、chalk 这几个依赖后直接运行即可;
抓取到的数据会写入 ./src/data.json,最后一次获取到的页面 HTML 会写入 ./src/index.html;
学习使用,如有侵权,请联系作者随时删改;