Chrome 浏览器中网页文字列表类数据的通用抽取方案。目的是自动识别页面中可以抽取的文字类列表数据,用于为爬虫自动生成 XPath 脚本。
半成品记录
/**
 * Evaluate an XPath expression and return the matching nodes as a plain array.
 *
 * @param {string} xpath - XPath expression to evaluate.
 * @param {Node} [context] - Node the expression is evaluated against; falls back to `document` when falsy.
 * @returns {Node[]} The matched nodes, in the order the ordered snapshot reports them.
 */
function $x(xpath, context) {
  const root = context || document;
  const snapshot = document.evaluate(
    xpath,
    root,
    null,
    XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
    null
  );
  // Copy the snapshot into a real array so callers can use array methods.
  return Array.from({ length: snapshot.snapshotLength }, (_, position) =>
    snapshot.snapshotItem(position)
  );
}
/**
 * Build an absolute XPath for an element by walking from it up through its ancestors.
 *
 * Each step is the lower-cased node name. A positional predicate (`[n]`, 1-based,
 * counted among siblings with the same tag) is added whenever the element has at
 * least one same-tag sibling, before OR after it. Previously the predicate was
 * only added when a same-tag sibling followed, so the last item of a list got an
 * unindexed step (e.g. `/ul/li`) that matches every item instead of just that one.
 *
 * @param {Node} element - Element to describe. Passing `document` returns '/html'.
 * @returns {string} The XPath, or '' when `element` is not an element node.
 */
function getXPath(element) {
  if (element === document) {
    return '/html';
  }
  let path = '';
  for (let node = element; node && node.nodeType === Node.ELEMENT_NODE; node = node.parentNode) {
    const tagName = node.nodeName.toLowerCase();
    const isSameTag = (candidate) =>
      candidate.nodeType === Node.ELEMENT_NODE && candidate.nodeName.toLowerCase() === tagName;

    // Count same-tag siblings that come before this node (gives the 0-based position).
    let precedingCount = 0;
    for (let sibling = node.previousSibling; sibling; sibling = sibling.previousSibling) {
      if (isSameTag(sibling)) {
        precedingCount++;
      }
    }

    // Only need to know whether at least one same-tag sibling follows.
    let hasFollowing = false;
    for (let sibling = node.nextSibling; sibling && !hasFollowing; sibling = sibling.nextSibling) {
      hasFollowing = isSameTag(sibling);
    }

    // The step is ambiguous as soon as any same-tag sibling exists on either side.
    const needsIndex = precedingCount > 0 || hasFollowing;
    const indexPart = needsIndex ? `[${precedingCount + 1}]` : '';

    // Prepend, because we are walking from the leaf towards the root.
    path = `/${tagName}${indexPart}${path}`;
  }
  return path;
}
/**
 * Build a tag-only path (no positional predicates) from the topmost ancestor down to `element`.
 *
 * @param {Node} element - Starting node; anything that is falsy or whose nodeType is not 1 yields ''.
 * @returns {string} Path such as '/html/body/div'.
 */
function getShortXPath(element) {
  if (!element || element.nodeType !== 1) {
    return "";
  }
  const segments = [];
  // Stop at the first node that has no parent; that node itself is not emitted.
  for (let node = element; node.parentNode; node = node.parentNode) {
    const isListedByParent = Array.from(node.parentNode.childNodes).indexOf(node) !== -1;
    if (isListedByParent) {
      // Walking leaf -> root, so each new segment goes to the front.
      segments.unshift(`/${node.tagName.toLowerCase()}`);
    }
  }
  return segments.join("");
}
/**
 * Collect every element that directly contains non-blank text and group the
 * elements whose generalized XPaths share the same parent path.
 *
 * The XPath of each element comes from `getXPath`; its last two positional
 * predicates are replaced by `[*]` so that repeated list items collapse onto the
 * same path. Groups with fewer than 3 members are discarded.
 *
 * Structural tags (html, body, script, style, title, head) are skipped. The tag
 * name is matched as a whole: the previous unanchored pattern also threw away
 * unrelated tags that merely contain one of those words, e.g. `<header>` and
 * `<thead>`.
 *
 * @returns {Array<Array<{element: Element, shortXPath: string, text: string}>>}
 *   One array per group, each sorted by ascending `shortXPath` length.
 */
function extractSimilarRegions() {
  const EXCLUDED_TAGS = /^(?:html|body|script|style|title|head)$/;
  // Matches the LAST "[n]" in a path (no further "[n]" after it).
  const LAST_INDEX = /\[\d+\](?!.*\[\d+\])/g;

  // Parents of all non-blank text nodes, minus structural tags.
  const elementsWithText = $x("//text()[normalize-space()!='']/..").filter(
    (element) => !EXCLUDED_TAGS.test(element.tagName.toLowerCase())
  );

  // A Map keeps insertion order and cannot collide with Object.prototype keys.
  const groupsByParent = new Map();
  for (const element of elementsWithText) {
    // Two passes: the first wildcards the last index, the second wildcards the
    // index that has become the last remaining numeric one.
    const shortXPath = getXPath(element).replace(LAST_INDEX, "[*]").replace(LAST_INDEX, "[*]");
    const text = element.textContent.trim();
    const parentXPath = shortXPath.substring(0, shortXPath.lastIndexOf('/'));
    if (!groupsByParent.has(parentXPath)) {
      groupsByParent.set(parentXPath, []);
    }
    groupsByParent.get(parentXPath).push({ element, shortXPath, text });
  }

  // Shorter paths (closest to the shared parent) first; keep only list-like groups.
  return Array.from(groupsByParent.values())
    .map((group) => group.sort((a, b) => a.shortXPath.length - b.shortXPath.length))
    .filter((group) => group.length >= 3);
}
// Run extractSimilarRegions and print the resulting groups to the console.
const similarRegions = extractSimilarRegions();
console.log(similarRegions);
原始
/**
 * Run an XPath query and hand back the matches as an array.
 *
 * @param {string} xpath - XPath expression.
 * @param {Node} [context] - Evaluation context; `document` is used when this is falsy.
 * @returns {Node[]} Nodes from the ordered snapshot, in snapshot order.
 */
function $x(xpath, context) {
  const scope = context || document;
  const snapshot = document.evaluate(xpath, scope, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
  const matches = [];
  let cursor = 0;
  while (cursor < snapshot.snapshotLength) {
    matches.push(snapshot.snapshotItem(cursor));
    cursor += 1;
  }
  return matches;
}
/**
 * Compute an absolute XPath for `element` by climbing its ancestor chain.
 *
 * Every step is the lower-cased node name, followed by a 1-based `[n]` predicate
 * when the element shares its tag with any sibling (before or after it). The
 * earlier version only looked at FOLLOWING siblings, so the last of several
 * same-tag siblings lost its index and produced an ambiguous step such as
 * `/ul/li`, which no longer lines up with `/ul/li[1]`, `/ul/li[2]`, ...
 *
 * @param {Node} element - Element to locate. `document` itself maps to '/html'.
 * @returns {string} The XPath, or '' if `element` is not an element node.
 */
function getXPath(element) {
  if (element === document) {
    return '/html';
  }
  const steps = [];
  let current = element;
  while (current && current.nodeType === Node.ELEMENT_NODE) {
    const tagName = current.nodeName.toLowerCase();
    const sharesTag = (other) =>
      other.nodeType === Node.ELEMENT_NODE && other.nodeName.toLowerCase() === tagName;

    // Number of same-tag siblings in front of the current node.
    let before = 0;
    for (let sibling = current.previousSibling; sibling; sibling = sibling.previousSibling) {
      if (sharesTag(sibling)) {
        before += 1;
      }
    }

    // Stop scanning forward at the first same-tag sibling; one is enough.
    let followedBySameTag = false;
    for (let sibling = current.nextSibling; sibling; sibling = sibling.nextSibling) {
      if (sharesTag(sibling)) {
        followedBySameTag = true;
        break;
      }
    }

    const ambiguous = before > 0 || followedBySameTag;
    steps.unshift(ambiguous ? `${tagName}[${before + 1}]` : tagName);
    current = current.parentNode;
  }
  return steps.map((step) => `/${step}`).join('');
}
/**
 * Derive one XPath template from a list of XPaths.
 *
 * The first path is the template. For every '/'-separated segment of it that
 * carries a numeric predicate `[n]`, the segment at the same position in each
 * path is compared; if any differs, the predicate becomes `[*]`.
 *
 * @param {string[]} strings - XPaths to merge.
 * @returns {string} The template, or '' for an empty list.
 */
function createTemplateWithWildcard(strings) {
  if (strings.length === 0) return '';

  const indexPattern = /\[\d+\]/;

  // True when at least one path has a different segment at `position`.
  const differsSomewhere = (segment, position) => {
    for (const candidate of strings) {
      if (candidate.split('/')[position] !== segment) {
        return true;
      }
    }
    return false;
  };

  return strings[0]
    .split('/')
    .map((segment, position) =>
      indexPattern.test(segment) && differsSomewhere(segment, position)
        ? segment.replace(indexPattern, '[*]')
        : segment
    )
    .join('/');
}
/**
 * Find all elements that directly hold non-blank text and bucket together the
 * ones that sit at the same place in a repeated structure.
 *
 * Two elements share a bucket when their XPaths (from `getXPath`) are equal after
 * every positional predicate in the ANCESTOR part is replaced by `[*]`; the last
 * step is compared as-is. Buckets with fewer than 3 elements are dropped, and
 * each remaining bucket gets an XPath template from `createTemplateWithWildcard`.
 *
 * Structural tags (html, body, script, style, title, head) are ignored. The tag
 * name is now matched in full; the old unanchored regex also rejected tags that
 * merely contain those words, such as `<header>` and `<thead>`.
 *
 * @returns {Array<{elements: Array<{element: Element, xPath: string, text: string}>, template: string}>}
 *   One entry per bucket; `elements` is sorted by ascending `xPath` length.
 */
function extractSimilarRegions() {
  const EXCLUDED_TAGS = /^(?:html|body|script|style|title|head)$/;

  // Parents of all non-blank text nodes, minus structural tags.
  const elementsWithText = $x("//text()[normalize-space()!='']/..").filter(
    (element) => !EXCLUDED_TAGS.test(element.tagName.toLowerCase())
  );

  // Map preserves insertion order and has no prototype-key collisions.
  const buckets = new Map();
  for (const element of elementsWithText) {
    const xPath = getXPath(element);
    const lastSlash = xPath.lastIndexOf('/');
    // Wildcard the ancestor indexes only; keep the final step untouched.
    const bucketKey = xPath.substring(0, lastSlash).replace(/\[\d+\]/g, "[*]") + xPath.substring(lastSlash);
    const text = element.textContent.trim();
    if (!buckets.has(bucketKey)) {
      buckets.set(bucketKey, []);
    }
    buckets.get(bucketKey).push({ element, xPath, text });
  }

  return Array.from(buckets.values())
    // Shortest XPath first within each bucket.
    .map((group) => group.sort((a, b) => a.xPath.length - b.xPath.length))
    // Fewer than three occurrences is not treated as a list.
    .filter((group) => group.length >= 3)
    .map((group) => ({
      elements: group,
      template: createTemplateWithWildcard(group.map((item) => item.xPath)),
    }));
}
/**
 * Index XPaths by every ancestor path they pass through and keep the paths
 * shared by more than one XPath.
 *
 * Each XPath is split on '/', empty segments are dropped, and the XPath is
 * registered under its full joined path and under every shorter prefix of it.
 *
 * @param {string[]} xPaths - XPaths (or XPath templates) to group.
 * @returns {Object<string, string[]>} Prefix (without leading '/') -> XPaths under it,
 *   restricted to prefixes that collected more than one XPath.
 */
function groupXPathsBySimilarity(xPaths) {
  const groups = {};
  for (const xpath of xPaths) {
    const segments = xpath.split('/').filter((segment) => segment);
    // Deepest prefix first, then progressively shorter ones.
    for (let depth = segments.length; depth > 0; depth--) {
      const prefix = segments.slice(0, depth).join('/');
      if (!groups[prefix]) {
        groups[prefix] = [];
      }
      groups[prefix].push(xpath);
    }
  }

  // A prefix reached by a single XPath says nothing about similarity.
  const sharedOnly = {};
  for (const prefix of Object.keys(groups)) {
    const members = groups[prefix];
    if (members.length > 1) {
      sharedOnly[prefix] = members;
    }
  }
  return sharedOnly;
}
// Find the page's text elements and group them into list-like regions.
const similarRegions = extractSimilarRegions();
console.log(similarRegions);
// Collect the XPath template of every region.
// Declared with const: the bare assignment created an implicit global, which
// throws a ReferenceError in strict mode and in ES modules.
const a = similarRegions.map((group) => group["template"]);
console.log(a);
// Group the XPath templates by the parent paths they share.
const groupedXPaths = groupXPathsBySimilarity(a);
console.log(groupedXPaths);
/**
 * Return the items of `arr1` that do not occur in `arr2`, preserving order and duplicates.
 *
 * @param {Array} arr1 - Source items.
 * @param {Array} arr2 - Items to exclude.
 * @returns {Array} A new array; neither input is modified.
 */
function difference(arr1, arr2) {
  const excluded = new Set(arr2);
  return arr1.reduce((kept, item) => {
    if (!excluded.has(item)) {
      kept.push(item);
    }
    return kept;
  }, []);
}
// Walk the parent paths in sorted order. Whenever two neighbouring paths hold a
// different number of XPaths, record the XPaths that the previous path has and
// the current one lacks.
var keys = Object.keys(groupedXPaths).sort();
const groupedXPaths1 = [];
for (let position = 1; position < keys.length; position++) {
  const previousGroup = groupedXPaths[keys[position - 1]];
  const currentGroup = groupedXPaths[keys[position]];
  if (previousGroup.length !== currentGroup.length) {
    groupedXPaths1.push(difference(previousGroup, currentGroup));
  }
}
console.log(groupedXPaths1);
xpath通过文本定位
// Locate elements by their text: for every text node whose normalized text is exactly
// 'Blackout Python', climb three levels (../../..), take the nearest preceding sibling
// element, and schedule a click on it 500 ms after this line runs.
$x("//text()[normalize-space(.) = 'Blackout Python']/../../../preceding-sibling::*[1]").forEach(function(a){setTimeout(function(){a.click()},500)})
$x方法实现
/**
 * Stand-in for the DevTools `$x` helper: evaluate an XPath and return the matches as an array.
 *
 * @param {string} xpath - XPath expression to evaluate.
 * @param {Node} [contextNode] - Context node; the whole document is used when this is falsy.
 * @returns {Node[]} Matched nodes in ordered-snapshot order.
 */
function $x(xpath, contextNode) {
  const scope = contextNode || document;
  const snapshot = document.evaluate(xpath, scope, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
  const positions = [...Array(snapshot.snapshotLength).keys()];
  return positions.map((position) => snapshot.snapshotItem(position));
}
// Example query: elements whose string value contains 'B' and that have an attribute value containing 'htm'.
// NOTE(review): in XPath 1.0 a node-set passed to contains() is converted using only its first node,
// so `contains(@*,'htm')` probably tests just the first attribute — confirm before relying on it.
$x( "//*[contains(string(),'B') and contains(@*,'htm')]")
非空文字节点
// Evaluate an XPath that selects text nodes whose whitespace-normalized content is not empty,
// and pull the first node from the result via iterateNext().
document.evaluate("//text()[normalize-space(.)!='']",document).iterateNext()
/**
 * Spider: helpers for spotting repeated, list-like text elements on a page.
 *
 * It builds a CSS selector path for each text-bearing element, keeps the paths
 * that occur several times, and clusters those paths by string similarity
 * (cosine over character frequencies, or edit distance).
 *
 * Declared with `var` on purpose so that, when pasted into a page or the
 * DevTools console, `Spider` stays reachable as a global as before.
 * `getItems` relies on a global `$x(xpath)` helper that returns an array of nodes.
 */
var Spider = {
  /**
   * Longest common leading path of two path strings.
   *
   * The shared character prefix is computed first; the trailing, possibly
   * partial segment (from the LAST '/' onward) is then removed. The previous
   * pattern `/\/.*?$/` matched from the FIRST '/', so any absolute path
   * collapsed to ''.
   *
   * @param {string} str1
   * @param {string} str2
   * @returns {string} Common prefix without its last '/segment'; unchanged when it contains no '/'.
   */
  findCommonStart: function (str1, str2) {
    const limit = Math.min(str1.length, str2.length);
    let end = 0;
    while (end < limit && str1[end] === str2[end]) {
      end++;
    }
    return str1.slice(0, end).replace(/\/[^/]*$/, "");
  },

  /**
   * Cosine similarity of two sparse vectors stored as `{ key: number }` objects.
   *
   * @param {Object<string, number>} vecA
   * @param {Object<string, number>} vecB
   * @returns {number} dot(A, B) / (|A| * |B|); 0 when either vector has zero magnitude.
   */
  cosineSimilarity: function (vecA, vecB) {
    let dotProduct = 0;
    let squaredA = 0;
    let squaredB = 0;
    for (const key of Object.keys(vecA)) {
      // A key missing from vecB contributes nothing to the dot product.
      dotProduct += vecA[key] * (vecB[key] || 0);
      squaredA += vecA[key] * vecA[key];
    }
    for (const key of Object.keys(vecB)) {
      squaredB += vecB[key] * vecB[key];
    }
    const magnitudeA = Math.sqrt(squaredA);
    const magnitudeB = Math.sqrt(squaredB);
    if (magnitudeA === 0 || magnitudeB === 0) {
      return 0;
    }
    return dotProduct / (magnitudeA * magnitudeB);
  },

  /**
   * Turn a string into a character-frequency vector.
   *
   * Iterates by code point, so a character outside the BMP is counted once
   * instead of as two separate surrogate halves.
   *
   * @param {string} str
   * @returns {Object<string, number>} character -> number of occurrences.
   */
  termFrequency: function (str) {
    const frequency = {};
    for (const term of str) {
      frequency[term] = (frequency[term] || 0) + 1;
    }
    return frequency;
  },

  /**
   * Levenshtein edit distance (insert, delete, substitute; each costs 1).
   *
   * @param {string} a
   * @param {string} b
   * @returns {number}
   */
  levenshteinDistance: function (a, b) {
    if (a.length === 0) {
      return b.length;
    }
    if (b.length === 0) {
      return a.length;
    }
    // matrix[i][j] = distance between the first i chars of b and the first j chars of a.
    const matrix = [];
    for (let i = 0; i <= b.length; i++) {
      matrix[i] = [i];
    }
    for (let j = 0; j <= a.length; j++) {
      matrix[0][j] = j;
    }
    for (let i = 1; i <= b.length; i++) {
      for (let j = 1; j <= a.length; j++) {
        if (b.charAt(i - 1) === a.charAt(j - 1)) {
          matrix[i][j] = matrix[i - 1][j - 1];
        } else {
          matrix[i][j] = Math.min(
            matrix[i - 1][j - 1] + 1, // substitution
            matrix[i][j - 1] + 1, // insertion
            matrix[i - 1][j] + 1 // deletion
          );
        }
      }
    }
    return matrix[b.length][a.length];
  },

  /**
   * Edit distance that, besides insert/delete/substitute, also allows swapping
   * two adjacent characters.
   *
   * @param {string} a
   * @param {string} b
   * @returns {number}
   */
  damerauLevenshteinDistance: function (a, b) {
    const lenA = a.length;
    const lenB = b.length;
    // (lenB + 1) x (lenA + 1) table: rows follow b, columns follow a.
    const matrix = new Array(lenB + 1).fill(null).map(() => new Array(lenA + 1).fill(0));
    for (let i = 0; i <= lenA; i++) {
      matrix[0][i] = i;
    }
    for (let j = 0; j <= lenB; j++) {
      matrix[j][0] = j;
    }
    for (let i = 1; i <= lenB; i++) {
      for (let j = 1; j <= lenA; j++) {
        const cost = a[j - 1] === b[i - 1] ? 0 : 1;
        matrix[i][j] = Math.min(
          matrix[i - 1][j] + 1, // deletion
          matrix[i][j - 1] + 1, // insertion
          matrix[i - 1][j - 1] + cost // substitution
        );
        // The last two characters of each prefix are each other's mirror: try a swap.
        if (i > 1 && j > 1 && a[j - 1] === b[i - 2] && a[j - 2] === b[i - 1]) {
          matrix[i][j] = Math.min(
            matrix[i][j],
            matrix[i - 2][j - 2] + cost // transposition
          );
        }
      }
    }
    return matrix[lenB][lenA];
  },

  /**
   * Distance between two strings under the named algorithm.
   *
   * @param {string} str1
   * @param {string} str2
   * @param {string} algorithm - 'cosine' (1 - cosine similarity of character
   *   frequencies), 'levenshtein' or 'damerau'.
   * @returns {number|undefined} The distance; undefined for an unknown algorithm name.
   */
  pairDistance: function (str1, str2, algorithm) {
    if (algorithm === 'cosine') {
      return 1 - Spider.cosineSimilarity(Spider.termFrequency(str1), Spider.termFrequency(str2));
    }
    if (algorithm === 'levenshtein') {
      return Spider.levenshteinDistance(str1, str2);
    }
    if (algorithm === 'damerau') {
      return Spider.damerauLevenshteinDistance(str1, str2);
    }
    return undefined;
  },

  /**
   * Average, maximum and minimum pairwise distance over all unordered pairs of `strings`.
   *
   * With fewer than two strings there are no pairs; all three values are then 0
   * (previously the average was NaN and the minimum Infinity, which made the
   * result unusable as a threshold).
   *
   * @param {string[]} strings
   * @param {string} algorithm - See `pairDistance`.
   * @returns {{average: number, max: number, min: number}}
   */
  calculateSimilarities: function (strings, algorithm) {
    let sumDistance = 0;
    let maxDistance = 0;
    let minDistance = Infinity;
    let count = 0;
    for (let i = 0; i < strings.length; i++) {
      for (let j = i + 1; j < strings.length; j++) {
        const distance = Spider.pairDistance(strings[i], strings[j], algorithm);
        sumDistance += distance;
        maxDistance = Math.max(maxDistance, distance);
        minDistance = Math.min(minDistance, distance);
        count++;
      }
    }
    if (count === 0) {
      return { average: 0, max: 0, min: 0 };
    }
    return {
      average: sumDistance / count,
      max: maxDistance,
      min: minDistance
    };
  },

  /**
   * Partition strings into groups of similar strings.
   *
   * Each string is compared with the first member of every existing group and
   * joins the FIRST group whose distance is within `threshold`; otherwise it
   * starts a new group. Previously a string was appended to every matching
   * group, so the same string could show up in several groups.
   *
   * @param {string[]} strings
   * @param {number} threshold - Maximum distance (inclusive) to join a group.
   * @param {string} algorithm - See `pairDistance`.
   * @returns {string[][]} Groups in order of creation.
   */
  groupStringsBySimilarity: function (strings, threshold, algorithm) {
    const groups = [];
    strings.forEach((candidate) => {
      const home = groups.find(
        (group) => Spider.pairDistance(candidate, group[0], algorithm) <= threshold
      );
      if (home) {
        home.push(candidate);
      } else {
        groups.push([candidate]);
      }
    });
    return groups;
  },

  /**
   * Build a CSS selector path ("a>b>c") for an element.
   *
   * Climbs through `parentElement` until it reaches `html` or an element with
   * an id (returned as `tag#id`, which ends the climb). Steps without an id
   * carry all of the element's classes. The id and class names are inserted as-is
   * (not CSS-escaped).
   *
   * Tag names are lower-cased with `toLowerCase` (locale-independent). A missing
   * parent (e.g. a detached element) now ends the path instead of throwing.
   *
   * @param {Element|null} element
   * @returns {string} The selector, or '' when `element` is null or has no nodeType.
   */
  getCssSelector: function (element) {
    if (!element || !element.nodeType) {
      return "";
    }
    const tagName = element.tagName.toLowerCase();
    if (tagName === "html") {
      return tagName;
    }
    if (element.id) {
      return tagName + "#" + element.id;
    }
    let currentSelector = tagName;
    if (element.classList.length > 0) {
      currentSelector += "." + Array.from(element.classList).join(".");
    }
    const parentSelector = Spider.getCssSelector(element.parentElement);
    return parentSelector ? parentSelector + ">" + currentSelector : currentSelector;
  },

  /**
   * Keep the entries of an object that satisfy a predicate.
   *
   * @param {Object} obj
   * @param {function([string, *]): boolean} filterFunction - Receives each `[key, value]` pair.
   * @returns {Object} A new object with the entries that passed.
   */
  filterObj: function (obj, filterFunction) {
    return Object.fromEntries(Object.entries(obj).filter(filterFunction));
  },

  /**
   * Collect the page's repeated text elements, keyed by CSS selector path.
   *
   * Takes the parent element of every non-blank text node, skips structural tags
   * (html, body, script, style, title, head; matched as whole tag names so that
   * `<header>` / `<thead>` are no longer dropped), groups the elements by
   * `getCssSelector`, and keeps the selectors that matched more than 3 elements.
   *
   * @returns {Object<string, Element[]>} selector -> elements sharing that selector.
   */
  getItems: function () {
    const excludedTags = /^(?:html|body|script|style|title|head)$/;
    const res = {};
    $x("//text()[normalize-space()!='']/..").forEach(function (a) {
      if (!excludedTags.test(a.tagName.toLowerCase())) {
        const exp = Spider.getCssSelector(a);
        res[exp] = res[exp] || [];
        res[exp].push(a);
      }
    });
    // Drop selectors that occur only a few times; keep the repeated ones.
    return Spider.filterObj(res, function ([key, value]) {
      return value.length > 3;
    });
  },

  /**
   * Cluster the repeated selector paths found by `getItems`.
   *
   * The clustering runs twice, once with the cosine distance and once with the
   * Levenshtein distance, each time using the average pairwise distance of that
   * metric as the grouping threshold.
   *
   * @returns {[string[][], string[][]]} `[cosineGroups, levenshteinGroups]`.
   */
  getPathGroups: function () {
    const selectors = Object.keys(Spider.getItems());
    const cosineStats = Spider.calculateSimilarities(selectors, 'cosine');
    const levenshteinStats = Spider.calculateSimilarities(selectors, 'levenshtein');
    const cosineGroups = Spider.groupStringsBySimilarity(selectors, cosineStats["average"], 'cosine');
    const levenshteinGroups = Spider.groupStringsBySimilarity(selectors, levenshteinStats["average"], 'levenshtein');
    return [cosineGroups, levenshteinGroups];
  }
}