1、puppeteer 配置反爬策略,抓取动态网页内容
2、本demo需使用npm安装kafka-node与puppeteer,mysql可选和去除相关读取任务和存取结果的功能
npm install kafka-node
npm install puppeteer
3、对比使用过其他多种模拟浏览器方式,没有成功
4、使用kafka接收任务和发送抓取结果
5、参考文章:https://cloud.tencent.com/developer/article/1751469?from=article.detail.1792297
/**
 * Program name: Little Crawler (小虫子)
 * @author lxy.
 * Receives Kafka messages,
 * crawls the requested web pages,
 * and sends the results back.
 */
const puppeteer = require('puppeteer');
var kafka = require('kafka-node'),
Producer = kafka.Producer,
Client = kafka.KafkaClient;
const ConsumerGroup = kafka.ConsumerGroup;
// Address handed to kafka-node as `kafkaHost` (for the client here and for the
// consumer group in start()). Replace the placeholder before running.
// NOTE(review): despite the name, this is used as the Kafka broker list, not
// as a ZooKeeper URL - confirm against the deployment.
const zkUrl = '*****';
const client = new kafka.KafkaClient({ kafkaHost: zkUrl });
// Declared explicitly: assigning to an undeclared `producer` creates an
// implicit global and is a ReferenceError in strict mode / ES modules.
const producer = new Producer(client);
setupProducer(producer);
// Topic the crawl tasks are read from.
const INPUT_TOPIC = 'input_topic';
// Topic the crawl results are written to.
const OUTPUT_TOPIC = 'output_topic';
/**
 * Program entry point.
 *
 * Joins the `nodejs-crawl` consumer group on INPUT_TOPIC and passes every
 * received message, parsed as JSON, to doTask(). Messages whose value is not
 * valid JSON are logged and skipped.
 */
function start() {
  const options = {
    id: 'c1',
    kafkaHost: zkUrl,
    groupId: 'nodejs-crawl',
    sessionTimeout: 30000,
    autoCommit: true,
  };
  const consumer = new ConsumerGroup(options, INPUT_TOPIC);

  // Without an 'error' listener an emitted 'error' event is thrown, which
  // would take the whole process down on a Kafka failure.
  consumer.on('error', (err) => {
    console.error('consumer error:', err);
  });

  consumer.on('message', (msg) => {
    let task;
    try {
      task = JSON.parse(msg.value);
    } catch (err) {
      // One malformed message must not crash the consumer; skip it.
      console.error('skip message with invalid JSON:', err.message);
      return;
    }
    doTask(task);
  });
}
/**
 * Runs one crawl task: launches a browser, loads `task.url`, reads the
 * rendered `document.body.innerHTML` and hands it to sendResult() together
 * with `task.taskId`.
 *
 * Failures are logged, not thrown, so the returned promise does not reject.
 * The browser is closed whether or not the crawl succeeded.
 *
 * @param {{taskId: *, url: string}} task - task description; a falsy value is ignored
 * @returns {Promise<void>} settles when the task has finished or failed
 */
async function doTask(task) {
  if (!task) {
    return;
  }
  console.log("doTask :", task);
  const taskId = task.taskId;
  const url = task.url;
  try {
    const browser = await puppeteer.launch({
      headless: false, // run with a visible local browser window instead of headless mode
      // executablePath: "./Chromium/chrome-win/chrome.exe", // needed when puppeteer was installed with --ignore-scripts (no bundled Chromium): full path to a local chrome.exe
      // launch timeout
      timeout: 15000,
      // ignore HTTPS errors when visiting https pages
      ignoreHTTPSErrors: true,
      // open DevTools; when true, headless is always false
      devtools: false,
    });
    try {
      const page = await browser.newPage();
      // Must finish before navigating, otherwise the first document can load
      // before the init scripts are registered.
      await pageSetting(page);
      // simulate visiting the page
      await page.goto(url, { waitUntil: "networkidle2" });
      await page.waitForSelector('html');
      const bodyInnerHTML = await page.evaluate(() => document.body.innerHTML);
      console.log("page load succ");
      sendResult(taskId, bodyInnerHTML);
      console.log("sendResult succ");
    } finally {
      // Always release the browser, even when navigation or extraction failed.
      await browser.close();
    }
    console.log("job finish,name:", taskId);
  } catch (err) {
    console.log(err);
  }
}
/**
 * Publishes one crawl result to OUTPUT_TOPIC through the shared producer.
 *
 * The result is sent as a single JSON string of the form
 * `{"taskId": ..., "content": ...}`. The outcome of the send is only logged.
 *
 * @param {*} taskId - identifier of the task the result belongs to
 * @param {string} bodyInnerHTML - page content to publish
 */
function sendResult(taskId, bodyInnerHTML) {
  // kafka-node message bodies are strings, Buffers or KeyedMessages; a plain
  // object would be stringified to "[object Object]", so serialise it first.
  const message = JSON.stringify({ taskId: taskId, content: bodyInnerHTML });
  producer.send([{
    topic: OUTPUT_TOPIC,
    messages: [message]
  }], function (err, result) {
    console.log("result:", result);
    if (err) {
      console.error(`common log kafka send message failed : ${err}, result is ${result}`);
      return;
    }
    console.log(`kafka send message succeed ${taskId}`);
  });
}
/**
 * Attaches logging listeners for the 'error' and 'ready' events of a producer.
 *
 * @param {{on: function(string, function): *}} producer - event emitter to listen on
 */
function setupProducer(producer) {
  producer.on('error', function (err) {
    // Log the error itself; the bare message gives no clue about the cause.
    console.error("setupProducer error.", err);
  });
  producer.on('ready', function () {
    console.info('setupProducer succ');
  });
}
/**
 * Registers init scripts on the page so that it looks like a regular browser.
 *
 * Seven scripts are registered, in this order, via
 * page.evaluateOnNewDocument(): webdriver flag removal, window.chrome stub,
 * userAgent, plugins, languages, permissions.query and WebGL vendor/renderer.
 * Each callback is executed inside the page, so it must not reference any
 * variable from this file.
 *
 * @param {object} page - page object exposing evaluateOnNewDocument(fn)
 * @returns {Promise<void>} resolves once all scripts have been registered
 */
async function pageSetting(page) {
  // webdriver: remove the navigator.webdriver field from the prototype
  await page.evaluateOnNewDocument(() => {
    const navigatorProto = Object.getPrototypeOf(navigator);
    delete navigatorProto.webdriver;
  });
  // add a window.chrome field and fill it with some values
  await page.evaluateOnNewDocument(() => {
    window.chrome = {};
    window.chrome.app = {
      InstallState: 'hehe',
      RunningState: 'haha',
      getDetails: 'xixi',
      getIsInstalled: 'ohno',
    };
    window.chrome.csi = function () { };
    window.chrome.loadTimes = function () { };
    window.chrome.runtime = function () { };
  });
  // userAgent
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'userAgent', {
      // in headless mode the user agent contains "headless", so override it
      get: () =>
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
    });
  });
  // plugins
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'plugins', {
      // mimic a realistic plugin list
      get: () => [
        {
          0: {
            type: 'application/x-google-chrome-pdf',
            suffixes: 'pdf',
            description: 'Portable Document Format',
            enabledPlugin: Plugin,
          },
          description: 'Portable Document Format',
          filename: 'internal-pdf-viewer',
          length: 1,
          name: 'Chrome PDF Plugin',
        },
        {
          0: {
            type: 'application/pdf',
            suffixes: 'pdf',
            description: '',
            enabledPlugin: Plugin,
          },
          description: '',
          filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
          length: 1,
          name: 'Chrome PDF Viewer',
        },
        {
          0: {
            type: 'application/x-nacl',
            suffixes: '',
            description: 'Native Client Executable',
            enabledPlugin: Plugin,
          },
          1: {
            type: 'application/x-pnacl',
            suffixes: '',
            description: 'Portable Native Client Executable',
            enabledPlugin: Plugin,
          },
          description: '',
          filename: 'internal-nacl-plugin',
          length: 2,
          name: 'Native Client',
        },
      ],
    });
  });
  // languages
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'languages', {
      get: () => ['zh-CN', 'zh', 'en'],
    });
  });
  // permissions: answer 'notifications' queries from Notification.permission
  await page.evaluateOnNewDocument(() => {
    const permissions = window.navigator.permissions;
    const originalQuery = permissions.query;
    permissions.query = (parameters) =>
      parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        // the original method needs its receiver, so call it on `permissions`
        : originalQuery.call(permissions, parameters);
  });
  // WebGL
  await page.evaluateOnNewDocument(() => {
    // the method lives on the prototype, not on the constructor
    const getParameter = WebGLRenderingContext.prototype.getParameter;
    WebGLRenderingContext.prototype.getParameter = function (parameter) {
      // UNMASKED_VENDOR_WEBGL
      if (parameter === 37445) {
        return 'Intel Inc.';
      }
      // UNMASKED_RENDERER_WEBGL
      if (parameter === 37446) {
        return 'Intel(R) Iris(TM) Graphics 6100';
      }
      // forward everything else to the real implementation with its context
      return getParameter.call(this, parameter);
    };
  });
}
// Start consuming crawl tasks as soon as the script is loaded.
start();