临时记录,通用列表归类抽取

Chrome 浏览器中网页文字列表类数据的通用抽取方案。目的是自动识别页面中可以抽取的文字类列表数据,供爬虫自动生成 XPath 脚本使用。
半成品记录

/**
 * Evaluate an XPath expression and return every matching node as an array.
 *
 * @param {string} xpath - XPath expression to evaluate.
 * @param {Node} [context] - Node the expression is evaluated against;
 *   `document` is used when this argument is falsy.
 * @returns {Node[]} Matching nodes, in ordered-snapshot order.
 */
function $x(xpath, context) {
    const root = context || document;
    const snapshot = document.evaluate(xpath, root, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
    return Array.from({ length: snapshot.snapshotLength }, (_, position) => snapshot.snapshotItem(position));
}

/**
 * Build an absolute XPath for an element by walking up its ancestor chain.
 *
 * Each step is the lower-cased node name. A 1-based positional predicate
 * (`[n]`) is appended whenever the element has at least one sibling with the
 * same node name, before OR after it. Previously the predicate was emitted
 * only when a later same-name sibling existed, so the last item of a list got
 * an index-less step that matched every sibling.
 *
 * @param {Node} element - Element to describe; passing `document` yields '/html'.
 * @returns {string} Absolute XPath, or '' when `element` is not an element node.
 */
function getXPath(element) {
    if (element === document) {
        return '/html';
    }

    const ELEMENT_NODE = 1; // numeric value of Node.ELEMENT_NODE
    const steps = [];

    for (let node = element; node && node.nodeType === ELEMENT_NODE; node = node.parentNode) {
        const tagName = node.nodeName.toLowerCase();
        const hasSameTag = (other) =>
            other.nodeType === ELEMENT_NODE && other.nodeName.toLowerCase() === tagName;

        // Count same-name siblings that come before this node.
        let precedingCount = 0;
        for (let sibling = node.previousSibling; sibling; sibling = sibling.previousSibling) {
            if (hasSameTag(sibling)) {
                precedingCount++;
            }
        }

        // Stop at the first same-name sibling that comes after this node.
        let hasFollowing = false;
        for (let sibling = node.nextSibling; sibling && !hasFollowing; sibling = sibling.nextSibling) {
            hasFollowing = hasSameTag(sibling);
        }

        // The step is ambiguous as soon as any same-name sibling exists.
        const needsIndex = precedingCount > 0 || hasFollowing;
        steps.unshift(needsIndex ? `${tagName}[${precedingCount + 1}]` : tagName);
    }

    return steps.length > 0 ? `/${steps.join('/')}` : '';
}

/**
 * Build an index-free path made only of lower-cased tag names, from the
 * top-most ancestor that has a parent down to the given element.
 *
 * @param {Node} element - Element to describe.
 * @returns {string} Path such as '/html/body/div', or "" when `element` is
 *   missing or is not an element node (nodeType 1).
 */
function getShortXPath(element) {
    if (!element || element.nodeType !== 1) {
        return "";
    }
    const tags = [];
    for (let node = element; node.parentNode; node = node.parentNode) {
        // A step is recorded only when the parent actually lists this node as a child.
        const isListedChild = Array.from(node.parentNode.childNodes).indexOf(node) !== -1;
        if (isListedChild) {
            tags.unshift(node.tagName.toLowerCase());
        }
    }
    return tags.map((tag) => `/${tag}`).join("");
}

/**
 * Collect every element that directly contains non-blank text and group the
 * elements whose wildcarded XPaths share the same parent path.
 *
 * The last two positional indexes of each XPath are replaced by `[*]`, the
 * final step is cut off to obtain the grouping key, and only groups holding at
 * least three elements are kept.
 *
 * Relies on `$x` and `getXPath` being available in the enclosing scope.
 *
 * @returns {Array<Array<{element: Element, shortXPath: string, text: string}>>}
 *   One array per group, each sorted by ascending `shortXPath` length.
 */
function extractSimilarRegions() {
    // Tags are matched exactly. The earlier unanchored pattern
    // /html|body|script|style|title|head/ also discarded <header> and <thead>;
    // <noscript>, which that pattern matched too, stays excluded.
    const excludedTags = new Set(['html', 'body', 'script', 'noscript', 'style', 'title', 'head']);
    // Matches a positional index that has no other index after it.
    const lastIndexPattern = /\[\d+\](?!.*\[\d+\])/;

    // Elements that are the parent of at least one non-blank text node.
    const elementsWithText = $x("//text()[normalize-space()!='']/..")
        .filter(element => !excludedTags.has(element.tagName.toLowerCase()));

    const groups = new Map();
    for (const element of elementsWithText) {
        // Applying the pattern twice wildcards the last two indexes.
        const shortXPath = getXPath(element)
            .replace(lastIndexPattern, "[*]")
            .replace(lastIndexPattern, "[*]");
        const text = element.textContent.trim();
        const parentXPath = shortXPath.substring(0, shortXPath.lastIndexOf('/'));

        if (!groups.has(parentXPath)) {
            groups.set(parentXPath, []);
        }
        groups.get(parentXPath).push({ element, shortXPath, text });
    }

    return [...groups.values()]
        .map(group => group.sort((a, b) => a.shortXPath.length - b.shortXPath.length))
        .filter(group => group.length >= 3);
}

// Run extractSimilarRegions on the current page and log the resulting groups.
const similarRegions = extractSimilarRegions();
console.log(similarRegions);

原始

/**
 * Run an XPath query and collect the matching nodes into a plain array.
 *
 * @param {string} xpath - XPath expression to evaluate.
 * @param {Node} [context] - Evaluation context; `document` when falsy.
 * @returns {Node[]} Nodes of the ordered snapshot, first to last.
 */
function $x(xpath, context) {
    const scope = context || document;
    const snapshot = document.evaluate(xpath, scope, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
    const positions = [...Array(snapshot.snapshotLength).keys()];
    return positions.map(position => snapshot.snapshotItem(position));
}

/**
 * Build an absolute XPath for an element by walking up through its ancestors.
 *
 * Every step is the lower-cased node name. A 1-based positional predicate is
 * added whenever the element shares its node name with any sibling, earlier or
 * later. The earlier implementation added it only when a later same-name
 * sibling existed, which left the last item of a list without an index and
 * made its path match all of its siblings.
 *
 * @param {Node} element - Element to describe; `document` itself maps to '/html'.
 * @returns {string} Absolute XPath, or '' when `element` is not an element node.
 */
function getXPath(element) {
    if (element === document) {
        return '/html';
    }

    const ELEMENT_NODE = 1; // numeric value of Node.ELEMENT_NODE
    let path = '';

    for (let current = element; current && current.nodeType === ELEMENT_NODE; current = current.parentNode) {
        const tagName = current.nodeName.toLowerCase();
        const isTwin = (node) => node.nodeType === ELEMENT_NODE && node.nodeName.toLowerCase() === tagName;

        // Number of same-name siblings in front of the current element.
        let twinsBefore = 0;
        for (let sibling = current.previousSibling; sibling; sibling = sibling.previousSibling) {
            if (isTwin(sibling)) {
                twinsBefore++;
            }
        }

        // Whether at least one same-name sibling follows the current element.
        let hasTwinAfter = false;
        for (let sibling = current.nextSibling; sibling && !hasTwinAfter; sibling = sibling.nextSibling) {
            hasTwinAfter = isTwin(sibling);
        }

        // Index the step as soon as it would otherwise be ambiguous.
        const indexPart = twinsBefore > 0 || hasTwinAfter ? `[${twinsBefore + 1}]` : '';

        // Prepend, because the walk goes from the element towards the root.
        path = `/${tagName}${indexPart}${path}`;
    }

    return path;
}

/**
 * Derive one XPath template from a list of XPaths.
 *
 * The first string is split on '/' and used as the template. A segment of it
 * that carries a positional index (`[n]`) has that index replaced by `[*]` when
 * any string in the list holds a different segment at the same position.
 * Segments without an index are copied unchanged.
 *
 * @param {string[]} strings - XPaths to merge.
 * @returns {string} The template, or '' for an empty list.
 */
function createTemplateWithWildcard(strings) {
    if (strings.length === 0) return '';

    const indexPattern = /\[\d+\]/;

    return strings[0]
        .split('/')
        .map((segment, position) => {
            // Only indexed segments are compared against the other strings.
            const differsSomewhere = indexPattern.test(segment)
                && strings.some(candidate => candidate.split('/')[position] !== segment);
            return differsSomewhere ? segment.replace(indexPattern, '[*]') : segment;
        })
        .join('/');
}

/**
 * Find every element that directly contains non-blank text, group elements
 * that share the same structural position, and attach an XPath template to
 * each group.
 *
 * The grouping key is the element's XPath with every positional index in the
 * ancestor steps replaced by `[*]`; the final step keeps its own index. Groups
 * with fewer than three elements are dropped.
 *
 * Relies on `$x`, `getXPath` and `createTemplateWithWildcard` being available
 * in the enclosing scope.
 *
 * @returns {Array<{elements: Array<{element: Element, xPath: string, text: string}>, template: string}>}
 *   One entry per group; `elements` is sorted by ascending `xPath` length.
 */
function extractSimilarRegions() {
    // Tags are matched exactly. The earlier unanchored pattern
    // /html|body|script|style|title|head/ also discarded <header> and <thead>;
    // <noscript>, which that pattern matched too, stays excluded.
    const excludedTags = new Set(['html', 'body', 'script', 'noscript', 'style', 'title', 'head']);

    // Elements that are the parent of at least one non-blank text node.
    const elementsWithText = $x("//text()[normalize-space()!='']/..")
        .filter(element => !excludedTags.has(element.tagName.toLowerCase()));

    const groups = new Map();
    for (const element of elementsWithText) {
        const xPath = getXPath(element);
        const cut = xPath.lastIndexOf('/');
        // Wildcard the ancestor steps only. The key is internal, so dropping
        // the doubled slash the earlier concatenation produced changes no grouping.
        const groupKey = xPath.substring(0, cut).replace(/\[\d+\]/g, "[*]") + xPath.substring(cut);
        const text = element.textContent.trim();

        if (!groups.has(groupKey)) {
            groups.set(groupKey, []);
        }
        groups.get(groupKey).push({ element, xPath, text });
    }

    return [...groups.values()]
        .map(group => group.sort((a, b) => a.xPath.length - b.xPath.length))
        .filter(group => group.length >= 3)
        .map(group => ({
            elements: group,
            template: createTemplateWithWildcard(group.map(item => item.xPath)),
        }));
}

/**
 * Index XPaths by every ancestor path they share.
 *
 * Each XPath is split on '/', empty segments are discarded, and the XPath is
 * filed under its full segment path and under every shorter leading prefix of
 * it. Only prefixes that collected more than one XPath are returned.
 *
 * @param {string[]} xPaths - XPaths (or XPath templates) to group.
 * @returns {Object<string, string[]>} Map from prefix (without a leading '/')
 *   to the original XPaths filed under it.
 */
function groupXPathsBySimilarity(xPaths) {
  const buckets = {};

  for (const xpath of xPaths) {
    const segments = xpath.split('/').filter(Boolean);

    // Walk from the deepest prefix up to the single top-level segment.
    for (let depth = segments.length; depth > 0; depth--) {
      const prefix = segments.slice(0, depth).join('/');
      buckets[prefix] = buckets[prefix] || [];
      buckets[prefix].push(xpath);
    }
  }

  // Keep only prefixes shared by more than one XPath.
  const shared = Object.entries(buckets).filter(([, members]) => members.length > 1);
  return Object.fromEntries(shared);
}

// Find the page's text elements and group them into list-like regions.
const similarRegions = extractSimilarRegions();
console.log(similarRegions);

// Collect the XPath template of every group.
// NOTE(review): `a` is assigned without a declaration, so it becomes an
// implicit global in sloppy mode and throws in strict mode or an ES module.
a = similarRegions.map(group => {
    return group["template"];
});
console.log(a);

// Group the XPath templates by the parent paths they share.
const groupedXPaths = groupXPathsBySimilarity(a);
console.log(groupedXPaths);

/**
 * Return the items of `arr1` that do not occur in `arr2`.
 *
 * @param {Array} arr1 - Items to keep from.
 * @param {Iterable} arr2 - Items to exclude (membership uses Set semantics).
 * @returns {Array} New array holding the surviving items in their original order.
 */
function difference(arr1, arr2) {
  const excluded = new Set(arr2);
  return arr1.reduce((kept, item) => {
    if (!excluded.has(item)) {
      kept.push(item);
    }
    return kept;
  }, []);
}

// Group keys in lexicographic order, so each key is compared with the one sorted just before it.
var keys = Object.keys(groupedXPaths).sort();

// Whenever two neighbouring keys hold a different number of XPaths, record the
// XPaths that belong to the earlier key but not to the later one.
const groupedXPaths1 = [];
for (let position = 1; position < keys.length; position++) {
    const previousGroup = groupedXPaths[keys[position - 1]];
    const currentGroup = groupedXPaths[keys[position]];
    if (currentGroup.length !== previousGroup.length) {
        groupedXPaths1.push(difference(previousGroup, currentGroup));
    }
}

console.log(groupedXPaths1);

xpath通过文本定位

// Locate text nodes whose normalized text equals 'Blackout Python', climb three
// ancestor levels, take the element sibling immediately before that ancestor,
// and click every match. All clicks are scheduled with the same 500 ms delay,
// so they are not staggered.
$x("//text()[normalize-space(.) = 'Blackout Python']/../../../preceding-sibling::*[1]").forEach(function(a){setTimeout(function(){a.click()},500)})

$x方法实现

/**
 * Minimal stand-in for the DevTools `$x` helper: evaluate an XPath expression
 * and return the matching nodes as an array.
 *
 * @param {string} xpath - XPath expression to evaluate.
 * @param {Node} [contextNode] - Evaluation context; the whole `document` when falsy.
 * @returns {Node[]} Nodes of the ordered snapshot, first to last.
 */
function $x(xpath, contextNode) {
  const root = contextNode || document;
  const snapshot = document.evaluate(xpath, root, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
  const matches = [];
  let position = 0;
  while (position < snapshot.snapshotLength) {
    matches.push(snapshot.snapshotItem(position));
    position += 1;
  }
  return matches;
}

// Select elements whose string value contains 'B' and whose attribute value contains 'htm'.
// NOTE(review): in XPath 1.0 a node-set passed to contains() is converted using
// only its first node, so `contains(@*,'htm')` likely tests a single attribute;
// confirm whether `@*[contains(., 'htm')]` (any attribute) was intended.
$x( "//*[contains(string(),'B') and contains(@*,'htm')]")

非空文字节点

// Evaluate an XPath for text nodes that are not blank after whitespace
// normalization and fetch the first node from the resulting iterator.
document.evaluate("//text()[normalize-space(.)!='']",document).iterateNext()
/**
 * Helpers for discovering repeated (list-like) text items on a page: elements
 * are keyed by a CSS-selector path, and those paths are then clustered by
 * string similarity.
 *
 * `getItems` and `getPathGroups` rely on a `$x(xpath)` helper being available
 * in the enclosing scope.
 */
var Spider = {
    /**
     * Longest common leading path of two "/"-separated strings.
     *
     * The character-wise common prefix is computed first. When it stops in the
     * middle of a segment, the partial segment is cut off. The earlier pattern
     * (/\/.*?$/) started at the FIRST "/", so every absolute path collapsed to
     * an empty string.
     *
     * @param {string} str1 - First path.
     * @param {string} str2 - Second path.
     * @returns {string} Common path; a prefix without any "/" is returned as is.
     */
    findCommonStart: function (str1, str2) {
        const limit = Math.min(str1.length, str2.length);
        let end = 0;
        while (end < limit && str1[end] === str2[end]) {
            end++;
        }
        const common = str1.slice(0, end);

        // The prefix is a whole path when both inputs end, or start a new segment, right after it.
        const endsOnBoundary = (path) => end === path.length || path[end] === '/';
        if (endsOnBoundary(str1) && endsOnBoundary(str2)) {
            return common;
        }
        return common.replace(/\/[^/]*$/, "");
    },

    /**
     * Cosine similarity of two sparse vectors stored as key -> number objects.
     *
     * @param {Object<string, number>} vecA - First vector.
     * @param {Object<string, number>} vecB - Second vector.
     * @returns {number} Similarity, or 0 when either vector has zero magnitude.
     */
    cosineSimilarity: function (vecA, vecB) {
        let dotProduct = 0;
        let squaredA = 0;
        let squaredB = 0;
        for (const key of Object.keys(vecA)) {
            dotProduct += vecA[key] * (vecB[key] || 0);
            squaredA += vecA[key] * vecA[key];
        }
        for (const key of Object.keys(vecB)) {
            squaredB += vecB[key] * vecB[key];
        }
        const magnitudeA = Math.sqrt(squaredA);
        const magnitudeB = Math.sqrt(squaredB);
        if (magnitudeA === 0 || magnitudeB === 0) {
            return 0;
        }
        return dotProduct / (magnitudeA * magnitudeB);
    },

    /**
     * Turn a string into a character-frequency vector.
     *
     * @param {string} str - Input string (split per UTF-16 code unit).
     * @returns {Object<string, number>} Occurrence count for each character.
     */
    termFrequency: function (str) {
        const frequency = {};
        for (const term of str.split('')) {
            frequency[term] = (frequency[term] || 0) + 1;
        }
        return frequency;
    },

    /**
     * Levenshtein edit distance between two strings.
     *
     * @param {string} a - First string.
     * @param {string} b - Second string.
     * @returns {number} Minimum number of insertions, deletions and substitutions.
     */
    levenshteinDistance: function (a, b) {
        if (a.length === 0) {
            return b.length;
        }
        if (b.length === 0) {
            return a.length;
        }
        const matrix = [];
        // First column and first row hold the distance from the empty string.
        for (let i = 0; i <= b.length; i++) {
            matrix[i] = [i];
        }
        for (let j = 0; j <= a.length; j++) {
            matrix[0][j] = j;
        }
        for (let i = 1; i <= b.length; i++) {
            for (let j = 1; j <= a.length; j++) {
                if (b.charAt(i - 1) === a.charAt(j - 1)) {
                    matrix[i][j] = matrix[i - 1][j - 1];
                } else {
                    matrix[i][j] = Math.min(
                        matrix[i - 1][j - 1] + 1, // substitution
                        matrix[i][j - 1] + 1, // insertion
                        matrix[i - 1][j] + 1 // deletion
                    );
                }
            }
        }
        return matrix[b.length][a.length];
    },

    /**
     * Edit distance that also counts a swap of two adjacent characters as a
     * single operation (Damerau-Levenshtein with adjacent transpositions).
     *
     * @param {string} a - First string.
     * @param {string} b - Second string.
     * @returns {number} Edit distance.
     */
    damerauLevenshteinDistance: function (a, b) {
        const lenA = a.length;
        const lenB = b.length;
        const matrix = new Array(lenB + 1).fill(null).map(() => new Array(lenA + 1).fill(0));
        for (let j = 0; j <= lenA; j++) {
            matrix[0][j] = j;
        }
        for (let i = 0; i <= lenB; i++) {
            matrix[i][0] = i;
        }
        for (let i = 1; i <= lenB; i++) {
            for (let j = 1; j <= lenA; j++) {
                const cost = a[j - 1] === b[i - 1] ? 0 : 1;
                matrix[i][j] = Math.min(
                    matrix[i - 1][j] + 1, // deletion
                    matrix[i][j - 1] + 1, // insertion
                    matrix[i - 1][j - 1] + cost // substitution
                );
                if (i > 1 && j > 1 && a[j - 1] === b[i - 2] && a[j - 2] === b[i - 1]) {
                    matrix[i][j] = Math.min(
                        matrix[i][j],
                        matrix[i - 2][j - 2] + cost // transposition
                    );
                }
            }
        }
        return matrix[lenB][lenA];
    },

    /**
     * Distance between two strings under the named algorithm.
     *
     * @param {string} str1 - First string.
     * @param {string} str2 - Second string.
     * @param {'cosine'|'levenshtein'|'damerau'} algorithm - Metric to use;
     *   'cosine' yields 1 minus the cosine similarity of the character-frequency vectors.
     * @returns {number} Distance (smaller means more similar).
     * @throws {Error} When `algorithm` is not one of the supported names.
     */
    computeDistance: function (str1, str2, algorithm) {
        switch (algorithm) {
            case 'cosine':
                return 1 - Spider.cosineSimilarity(Spider.termFrequency(str1), Spider.termFrequency(str2));
            case 'levenshtein':
                return Spider.levenshteinDistance(str1, str2);
            case 'damerau':
                return Spider.damerauLevenshteinDistance(str1, str2);
            default:
                // An unknown name used to produce `undefined` distances and NaN statistics.
                throw new Error(`Unknown similarity algorithm: ${algorithm}`);
        }
    },

    /**
     * Pairwise distance statistics over a list of strings.
     *
     * @param {string[]} strings - Strings to compare with each other.
     * @param {'cosine'|'levenshtein'|'damerau'} algorithm - Metric to use.
     * @returns {{average: number, max: number, min: number}} Statistics over all
     *   unordered pairs; all zero when there are fewer than two strings.
     * @throws {Error} When `algorithm` is unknown and at least one pair exists.
     */
    calculateSimilarities: function (strings, algorithm) {
        let sumDistance = 0;
        let maxDistance = 0;
        let minDistance = Infinity;
        let count = 0;
        for (let i = 0; i < strings.length; i++) {
            for (let j = i + 1; j < strings.length; j++) {
                const distance = Spider.computeDistance(strings[i], strings[j], algorithm);
                sumDistance += distance;
                maxDistance = Math.max(maxDistance, distance);
                minDistance = Math.min(minDistance, distance);
                count++;
            }
        }
        // Without any pair the average would be 0/0 (NaN) and the minimum Infinity.
        if (count === 0) {
            return { average: 0, max: 0, min: 0 };
        }
        return {
            average: sumDistance / count,
            max: maxDistance,
            min: minDistance
        };
    },

    /**
     * Partition strings into groups of similar strings.
     *
     * Each string is compared with the first member of every existing group, in
     * creation order, and joins the first group whose distance is within
     * `threshold`; otherwise it starts a new group. Earlier, a string was
     * appended to every matching group and so could appear several times.
     *
     * @param {string[]} strings - Strings to group.
     * @param {number} threshold - Largest distance still counted as similar.
     * @param {'cosine'|'levenshtein'|'damerau'} algorithm - Metric to use.
     * @returns {string[][]} The groups, in creation order.
     * @throws {Error} When `algorithm` is unknown and a comparison is needed.
     */
    groupStringsBySimilarity: function (strings, threshold, algorithm) {
        const groups = [];
        for (const candidate of strings) {
            const home = groups.find(
                group => Spider.computeDistance(candidate, group[0], algorithm) <= threshold
            );
            if (home) {
                home.push(candidate);
            } else {
                groups.push([candidate]);
            }
        }
        return groups;
    },

    /**
     * Build a CSS selector path for an element.
     *
     * Steps are joined with ">" and consist of the tag name plus either `#id`
     * or the element's classes. The walk stops at the first ancestor with an id,
     * at <html>, or when there is no parent element (detached node).
     * NOTE(review): ids and class names are inserted without CSS escaping.
     *
     * @param {Element|null} element - Element to describe.
     * @returns {string} Selector path, or "" when `element` is not an element node.
     */
    getCssSelector: function (element) {
        if (!element || element.nodeType !== 1) {
            return "";
        }
        // toLowerCase, not toLocaleLowerCase: tag names must not depend on the UI locale.
        const tagName = element.tagName.toLowerCase();
        if (tagName === "html") {
            return tagName;
        }
        if (element.id) {
            return `${tagName}#${element.id}`;
        }
        let currentSelector = tagName;
        if (element.classList.length > 0) {
            currentSelector += `.${Array.from(element.classList).join(".")}`;
        }
        const parentSelector = Spider.getCssSelector(element.parentElement);
        return parentSelector ? `${parentSelector}>${currentSelector}` : currentSelector;
    },

    /**
     * Copy an object, keeping only the entries accepted by a predicate.
     *
     * @param {Object} obj - Source object.
     * @param {function([string, *]): boolean} filterFunction - Called with each [key, value] pair.
     * @returns {Object} New object holding the accepted entries.
     */
    filterObj: function (obj, filterFunction) {
        return Object.fromEntries(
            Object.entries(obj).filter(filterFunction));
    },

    /**
     * Collect the page's repeated text elements, keyed by CSS-selector path.
     *
     * @returns {Object<string, Element[]>} Selector path -> elements, limited to
     *   paths shared by more than three elements.
     */
    getItems: function () {
        // Tags are matched exactly. The earlier unanchored pattern
        // /html|body|script|style|title|head/ also discarded <header> and
        // <thead>; <noscript>, which that pattern matched too, stays excluded.
        const excludedTags = new Set(['html', 'body', 'script', 'noscript', 'style', 'title', 'head']);
        const bySelector = Object.create(null);

        // Every element that is the parent of a non-blank text node.
        $x("//text()[normalize-space()!='']/..").forEach(function (element) {
            if (excludedTags.has(element.tagName.toLowerCase())) {
                return;
            }
            const selector = Spider.getCssSelector(element);
            bySelector[selector] = bySelector[selector] || [];
            bySelector[selector].push(element);
        });

        // Keep only selector paths that occur more than three times.
        return Spider.filterObj(bySelector, function ([, elements]) {
            return elements.length > 3;
        });
    },

    /**
     * Cluster the selector paths found by `getItems`, once per metric, using
     * each metric's average pairwise distance as the grouping threshold.
     *
     * @returns {Array<string[][]>} `[cosineGroups, levenshteinGroups]`.
     */
    getPathGroups: function () {
        const selectorPaths = Object.keys(Spider.getItems());

        const cosineStats = Spider.calculateSimilarities(selectorPaths, 'cosine');
        const levenshteinStats = Spider.calculateSimilarities(selectorPaths, 'levenshtein');

        const cosineGroups = Spider.groupStringsBySimilarity(selectorPaths, cosineStats["average"], 'cosine');
        const levenshteinGroups = Spider.groupStringsBySimilarity(selectorPaths, levenshteinStats["average"], 'levenshtein');
        return [cosineGroups, levenshteinGroups];
    }
}
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 213,014评论 6 492
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 90,796评论 3 386
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 158,484评论 0 348
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 56,830评论 1 285
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 65,946评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,114评论 1 292
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,182评论 3 412
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 37,927评论 0 268
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,369评论 1 303
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 36,678评论 2 327
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 38,832评论 1 341
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,533评论 4 335
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,166评论 3 317
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 30,885评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,128评论 1 267
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 46,659评论 2 362
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 43,738评论 2 351

推荐阅读更多精彩内容