puppeteer 爬虫模板

import * as puppeteer from 'puppeteer-core';

/**
 * Script entry point: launches the browser, opens the target article,
 * prints the image URLs and the cleaned HTML found under `xpath`, then
 * shuts everything down.
 *
 * The browser is closed in a `finally` block so a failure while scraping
 * does not leave the browser process running, and any error is reported
 * through the trailing `.catch` instead of becoming an unhandled rejection.
 */
(async () => {
    const browser = await puppeteer.launch({
        executablePath: '/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge',
        // NOTE(review): '--disable-web-security' and the IsolateOrigins/site-per-process
        // opt-out relax browser isolation; confirm they are really needed for this site.
        args: ['--start-maximized', '--disable-extensions', '--disable-infobars', '--disable-web-security', '--disable-features=IsolateOrigins,site-per-process'],
        headless: true,
    });
    try {
        const page = await browser.newPage();
        await page.setViewport({width: 1920, height: 1080});
        await page.goto('http://www.petsworld.cn/Html/zhzx/guonei/2023/3/6376401601033.html', {waitUntil: 'networkidle2'});
        const xpath = "/html/body/table[2]/tbody/tr/td[1]/table/tbody/tr[2]/td/table/tbody/tr[4]/td/table[1]/tbody/tr[1]";

        const images = await getImages(page, xpath);
        console.log(images);

        const html = await getHtml(page, xpath);
        console.log(html);

        await page.close();
    } finally {
        // Always release the browser, even when navigation or extraction fails.
        await browser.close();
    }
})().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

/**
 * Return the cleaned inner HTML of the first element matched by `xpath`.
 *
 * Before serialising, every <img> under the element has its `src` rewritten
 * to an absolute URL (resolved against the page location) so the extracted
 * markup stays valid outside the original page.
 *
 * @param {object} page - Puppeteer page; must provide `$x` and `evaluate`.
 * @param {string} xpath - XPath expression selecting the container element.
 * @returns {Promise<string>} The element's innerHTML after `clean_html`.
 * @throws {Error} If the XPath matches no element (the promise rejects).
 */
const getHtml = async (page, xpath) => {
    const [element] = await page.$x(xpath);
    if (!element) {
        throw new Error(`No element matched XPath: ${xpath}`);
    }

    // Make image URLs absolute inside the page so innerHTML serialises them that way.
    await page.evaluate((root) => {
        for (const image of root.querySelectorAll('img')) {
            image.src = new URL(image.src, window.location.href).href;
        }
    }, element);

    const html = await page.evaluate((root) => root.innerHTML, element);
    return clean_html(html);
};

/**
 * Collect the absolute URLs of all <img> elements under the first element
 * matched by `xpath`.
 *
 * URLs are resolved against the page location inside the browser context.
 * The page's DOM is only read, not modified.
 *
 * @param {object} page - Puppeteer page; must provide `$x` and `evaluate`.
 * @param {string} xpath - XPath expression selecting the container element.
 * @returns {Promise<string[]>} Absolute image URLs in document order.
 * @throws {Error} If the XPath matches no element (the promise rejects).
 */
const getImages = async (page, xpath) => {
    const [element] = await page.$x(xpath);
    if (!element) {
        throw new Error(`No element matched XPath: ${xpath}`);
    }

    return page.evaluate(
        (root) => Array.from(
            root.querySelectorAll('img'),
            (image) => new URL(image.src, window.location.href).href,
        ),
        element,
    );
};


/**
 * Strip presentational attributes and inline event handlers from an HTML string.
 *
 * An attribute is removed only when its name is preceded by whitespace, so
 * look-alike names such as `data-width` are left intact, and the leading
 * whitespace is removed with it (no `<td >` leftovers). Both double- and
 * single-quoted values are handled; matching is case-insensitive.
 *
 * @param {string} html - HTML markup to clean.
 * @returns {string} The markup without the listed attributes.
 */
function clean_property(html) {
    const properties = [
        'style', 'class', 'size', 'face', 'lang', 'width', 'height', 'valign', 'align', 'border', 'cellpadding',
        'cellspacing', 'onload', 'onunload', 'onclick', 'ondblclick', 'onmousedown', 'onmouseup',
        'onmouseover', 'onmousemove', 'onmouseout', 'onkeypress', 'onkeydown', 'onkeyup', 'onfocus',
        'onblur', 'onselect', 'onchange', 'onsubmit', 'onreset', 'onselectstart', 'onerrorupdate',
        'onhelp', 'onrowexit', 'onrowenter', 'onbeforeunload', 'onbeforeupdate', 'onafterupdate',
        'oncellchange', 'oncontextmenu', 'ondataavailable', 'ondatasetchanged', 'ondatasetcomplete',
        'onmousewheel', 'onreadystatechange', 'onstop', 'onrowsdelete', 'onrowsinserted', 'onbeforeeditfocus',
    ];
    // One pass over the input: whitespace, an exact attribute name, then a quoted value.
    const attributePattern = new RegExp(
        '\\s+(?:' + properties.join('|') + ')=(?:"[^"]*"|\'[^\']*\')',
        'gi',
    );
    return html.replace(attributePattern, '');
}

/**
 * Remove unwanted elements from an HTML string.
 *
 * - Container elements (script, style, iframe, noscript, object, applet)
 *   are removed together with their content, including content that spans
 *   several lines.
 * - Void elements (link, meta, param, embed) have no closing tag, so their
 *   tags are removed on their own.
 * - Wrapper elements (font, span, o:p) are unwrapped: the tags are removed
 *   and their content is kept.
 *
 * Tag names are matched exactly (a name must be followed by whitespace, `/`
 * or `>`) and case-insensitively.
 *
 * @param {string} html - HTML markup to clean.
 * @returns {string} The markup without the listed elements.
 */
function clean_tags(html) {
    const containerTags = ['script', 'style', 'iframe', 'noscript', 'object', 'applet'];
    const voidTags = ['link', 'meta', 'param', 'embed'];
    const wrapperTags = ['font', 'span', 'o:p'];
    // Lookahead that ends the tag name, so e.g. 'span' does not match '<spanner'.
    const nameEnd = '(?=[\\s/>])';

    let result = html;
    for (const tag of containerTags) {
        // [\s\S] instead of '.' so the element body may contain newlines.
        const element = new RegExp('<' + tag + nameEnd + '[^>]*>[\\s\\S]*?</' + tag + '\\s*>', 'gi');
        result = result.replace(element, '');
    }
    for (const tag of [...voidTags, ...wrapperTags]) {
        const openTag = new RegExp('<' + tag + nameEnd + '[^>]*>', 'gi');
        const closeTag = new RegExp('</' + tag + '\\s*>', 'gi');
        result = result.replace(openTag, '').replace(closeTag, '');
    }
    return result;
}

/**
 * Normalise a scraped HTML fragment: strip unwanted attributes and tags,
 * drop comments and `&nbsp;` entities, collapse whitespace, remove empty
 * elements and finally pretty-print the result.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Cleaned and beautified markup.
 */
function clean_html(html) {
    let cleaned = clean_tags(clean_property(html));

    // Remove comments; [\s\S] lets a comment span several lines.
    cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, "");

    // Remove non-breaking-space entities, then collapse every whitespace run
    // (including line breaks) to a single space so adjacent words are never
    // glued together.
    cleaned = cleaned.replace(/&nbsp;/g, "");
    cleaned = cleaned.replace(/\s+/g, " ");

    // Remove empty elements; repeat because removing an inner empty element
    // can leave its parent empty as well.
    let previous;
    do {
        previous = cleaned;
        cleaned = cleaned.replace(/<(\w+)[^>]*>\s*<\/\1>/g, "");
    } while (cleaned !== previous);

    return html_beautify(cleaned);
}

/**
 * Pretty-print an HTML string using the `html` formatter of js-beautify.
 *
 * @param {string} html - Markup to format.
 * @returns {string} The formatted markup (4-space indent, CRLF line endings).
 */
function html_beautify(html) {
    // NOTE(review): this file also uses ES `import`; `require` is only defined
    // under CommonJS or a transpiler/bundler — confirm the runtime.
    // eslint-disable-next-line no-undef
    const {html: formatHtml} = require('js-beautify');
    return formatHtml(html, {
        indent_size: 4,
        indent_char: ' ',
        eol: '\r\n',
        indent_level: 0,
        indent_with_tabs: false,
        preserve_newlines: true,
        max_preserve_newlines: 10,
    });
}

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容