puppeteer抓取动态网页demo

1、puppeteer 配置反爬策略,抓取动态网页内容
2、本demo需使用npm安装kafka-node与puppeteer,mysql可选和去除相关读取任务和存取结果的功能

npm install kafka-node
npm install puppeteer

3、对比使用过其他多种模拟浏览器方式,没有成功
4、使用kafka接收任务和发送抓取结果
5、参考文章:https://cloud.tencent.com/developer/article/1751469?from=article.detail.1792297

/**
 * -----程序名称:小虫子------
 * @author lxy.
 *
 * 接收kafka消息
 * 抓取网页
 * 发送结果
 */
const puppeteer = require('puppeteer');
var kafka = require('kafka-node'),
Producer = kafka.Producer,
Client = kafka.KafkaClient;
const ConsumerGroup = kafka.ConsumerGroup;

// Kafka address used by both the producer client and the consumer group.
// It is passed as `kafkaHost`; replace the placeholder before running.
// NOTE(review): the name says "zk", but `kafkaHost` is documented by kafka-node
// as a broker host:port list rather than a ZooKeeper address — confirm.
const zkUrl = '*****';
const client = new kafka.KafkaClient({ kafkaHost: zkUrl });
// Declared explicitly: assigning to an undeclared `producer` creates an
// implicit global and throws a ReferenceError in strict mode / ES modules.
const producer = new Producer(client);
setupProducer(producer);

// Topic the crawler consumes tasks from.
const INPUT_TOPIC = 'input_topic';
// Topic the crawl results are published to.
const OUTPUT_TOPIC = 'output_topic';

/**
 * Program entry point: joins the `nodejs-crawl` consumer group on INPUT_TOPIC
 * and hands every received message, parsed as JSON, to doTask.
 *
 * A message whose value is not valid JSON is logged and skipped instead of
 * throwing out of the event handler.
 */
function start() {
  const options = {
    kafkaHost: zkUrl,
    groupId: 'nodejs-crawl',
    sessionTimeout: 30000,
    autoCommit: true,
  };
  const consumer = new ConsumerGroup(Object.assign({ id: 'c1' }, options), INPUT_TOPIC);

  consumer.on('message', msg => {
    let task;
    try {
      task = JSON.parse(msg.value);
    } catch (err) {
      // One malformed message must not take the whole consumer down.
      console.error('start: skipping malformed task message:', err);
      return;
    }
    doTask(task);
  });

  // An emitter without an 'error' listener throws when 'error' is emitted,
  // so always register one.
  consumer.on('error', err => {
    console.error('start: consumer error:', err);
  });
}

/**
 * Runs one crawl task: launches a browser, applies the page settings, loads
 * `task.url`, reads `document.body.innerHTML` and passes it to sendResult.
 *
 * Any failure is logged and swallowed, so the returned promise does not reject.
 * The browser is closed whether or not the crawl succeeded.
 *
 * @param {{taskId: *, url: string}} task - Task description; a falsy task is ignored.
 * @returns {Promise<void>} Settles once the task has finished or failed.
 */
async function doTask(task) {
  if (!task) {
    return;
  }
  console.log("doTask :", task);
  try {
    const browser = await puppeteer.launch({
      headless: false, // run with a visible browser window instead of headless mode
      // executablePath: "./Chromium/chrome-win/chrome.exe", // set this to a local chrome.exe when puppeteer was installed without its bundled Chromium
      // launch timeout
      timeout: 15000,
      // ignore HTTPS errors when visiting https pages
      ignoreHTTPSErrors: true,
      // open DevTools; when true, headless is always false
      devtools: false,
    });
    try {
      const page = await browser.newPage();
      // Must finish before navigation, otherwise the override scripts may not
      // be registered yet when the document is created.
      await pageSetting(page);
      const { taskId, url } = task;
      await page.goto(url, { waitUntil: "networkidle2" });
      await page.waitForSelector('html');
      const bodyInnerHTML = await page.evaluate(() => document.body.innerHTML);
      console.log("page load succ");
      sendResult(taskId, bodyInnerHTML);
      console.log("sendResult succ");
    } finally {
      // Close on failure as well, so a failed task does not leak a browser process.
      await browser.close();
    }
    console.log("job finish,name:", task.taskId);
  } catch (err) {
    console.log(err);
  }
}

/**
 * Publishes one crawl result to OUTPUT_TOPIC through the shared producer.
 *
 * The result is sent as a single JSON string of the form
 * `{"taskId": ..., "content": ...}`. The outcome of the send is only logged.
 *
 * @param {*} taskId - Identifier of the task the result belongs to.
 * @param {string} bodyInnerHTML - Captured page markup.
 */
function sendResult(taskId, bodyInnerHTML) {
  // Serialize explicitly: a plain object is not a string/Buffer payload.
  // NOTE(review): kafka-node appears to coerce such values with toString(),
  // which would publish "[object Object]" — confirm against the library.
  const message = JSON.stringify({ taskId, content: bodyInnerHTML });
  producer.send([{
    topic: OUTPUT_TOPIC,
    messages: [message],
  }], (err, result) => {
    console.log("result:", result);
    if (err) {
      console.error(`common log kafka send message failed : ${err}, result is ${result}`);
      return;
    }
    console.log(`kafka send message succeed ${taskId}`);
  });
}

/**
 * Attaches logging listeners for the producer's 'error' and 'ready' events.
 *
 * @param {object} producer - Event emitter exposing `on(event, listener)`.
 */
function setupProducer(producer) {
  producer.on('error', (err) => {
    // Include the error itself; the bare message gives nothing to diagnose.
    console.error("setupProducer error.", err);
  });
  producer.on('ready', () => {
    console.info('setupProducer succ');
  });
}

/**
 * Registers scripts on `page` that override browser properties which tend to
 * reveal automation: navigator.webdriver, window.chrome, userAgent, plugins,
 * languages, permissions.query and the WebGL vendor/renderer strings.
 *
 * Every callback is handed to page.evaluateOnNewDocument and runs inside the
 * page, so each one is self-contained and must not reference variables from
 * this Node.js scope.
 *
 * @param {object} page - Puppeteer page; only `evaluateOnNewDocument` is used.
 * @returns {Promise<void>} Resolves once all scripts have been registered.
 */
async function pageSetting(page) {
  // webdriver: remove the field from the navigator prototype.
  // (`navigator.proto` is undefined, so deleting through it threw.)
  await page.evaluateOnNewDocument(() => {
    const navigatorProto = Object.getPrototypeOf(navigator);
    delete navigatorProto.webdriver;
  });
  // Add window.chrome and fill it with some values.
  await page.evaluateOnNewDocument(() => {
    window.chrome = {};
    window.chrome.app = {
      InstallState: 'hehe',
      RunningState: 'haha',
      getDetails: 'xixi',
      getIsInstalled: 'ohno',
    };
    window.chrome.csi = function () { };
    window.chrome.loadTimes = function () { };
    window.chrome.runtime = function () { };
  });
  // userAgent: the headless user agent contains "headless", so override it.
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'userAgent', {
      get: () =>
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
    });
  });
  // plugins: report a realistic-looking plugin list.
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'plugins', {
      get: () => [
        {
          0: {
            type: 'application/x-google-chrome-pdf',
            suffixes: 'pdf',
            description: 'Portable Document Format',
            enabledPlugin: Plugin,
          },
          description: 'Portable Document Format',
          filename: 'internal-pdf-viewer',
          length: 1,
          name: 'Chrome PDF Plugin',
        },
        {
          0: {
            type: 'application/pdf',
            suffixes: 'pdf',
            description: '',
            enabledPlugin: Plugin,
          },
          description: '',
          filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai',
          length: 1,
          name: 'Chrome PDF Viewer',
        },
        {
          0: {
            type: 'application/x-nacl',
            suffixes: '',
            description: 'Native Client Executable',
            enabledPlugin: Plugin,
          },
          1: {
            type: 'application/x-pnacl',
            suffixes: '',
            description: 'Portable Native Client Executable',
            enabledPlugin: Plugin,
          },
          description: '',
          filename: 'internal-nacl-plugin',
          length: 2,
          name: 'Native Client',
        },
      ],
    });
  });
  // languages: report the accepted languages.
  await page.evaluateOnNewDocument(() => {
    Object.defineProperty(navigator, 'languages', {
      get: () => ['zh-CN', 'zh', 'en'],
    });
  });
  // permissions: answer the notifications query from Notification.permission.
  await page.evaluateOnNewDocument(() => {
    const permissions = window.navigator.permissions;
    // Bind to the permissions object: the native method rejects calls made
    // with a different `this`, which is what an unbound reference produces.
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
      parameters.name === 'notifications'
        ? Promise.resolve({ state: Notification.permission })
        : originalQuery(parameters);
  });
  // WebGL: spoof vendor and renderer, delegate everything else.
  await page.evaluateOnNewDocument(() => {
    // The method lives on the prototype, not on the constructor, and must be
    // invoked with the rendering context as `this`.
    const getParameter = WebGLRenderingContext.prototype.getParameter;
    WebGLRenderingContext.prototype.getParameter = function (parameter) {
      // UNMASKED_VENDOR_WEBGL
      if (parameter === 37445) {
        return 'Intel Inc.';
      }
      // UNMASKED_RENDERER_WEBGL
      if (parameter === 37446) {
        return 'Intel(R) Iris(TM) Graphics 6100';
      }
      return getParameter.call(this, parameter);
    };
  });
}

// Start consuming crawl tasks as soon as the script is loaded.
start();

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容