提取特征:
extract_features.js 会根据 urls 提取两类特征:HTML 特征和 Chrome distilled 特征,对应的输出分别为 name.feature 和 name.dfeature。
features:
'opengraph': hasOGArticle(),
'url': document.location.href,
'title': document.title,
'numElements': body.querySelectorAll('*').length,
'numAnchors': body.querySelectorAll('a').length,
'numForms': body.querySelectorAll('form').length,
'numTextInput': body.querySelectorAll('input[type="text"]').length,
'numPasswordInput': body.querySelectorAll('input[type="password"]').length,
'numPPRE': body.querySelectorAll('p,pre').length,
'innerText': body.innerText,
'textContent': body.textContent,
'innerHTML': body.innerHTML,
'mozScore': Math.min(6 * Math.sqrt(1000 - 140), _mozScore(false, 0.5, 140, true, 1000)),
'mozScoreAllSqrt': Math.min(6 * Math.sqrt(1000), _mozScore(false, 0.5, 0, true, 1000)),
'mozScoreAllLinear': Math.min(6 * 1000, _mozScore(false, 1, 0, true, 1000)),
'visibleElements': countVisible(body.querySelectorAll('*')),
'visiblePPRE': countVisible(body.querySelectorAll('p,pre')),
native:
derived features:
features = [
'id', index,
'sin', math.sin(index),
'openGraph', opengraph,
'forum', 'forum' in path,
'index', 'index' in path,
'search', 'search' in path,
'view', 'view' in path,
'archive', 'archive' in path,
'asp', '.asp' in path,
'phpbb', 'phpbb' in path,
'php', path.endswith('.php'),
'pathLength', len(path),
'domain', len(path) < 2,
'pathComponents', CountMatches(path, r'\/.'),
'slugDetector', CountMatches(path, r'[^\w/]'),
'pathNumbers', CountMatches(path, r'\d+'),
'lastSegmentLength', len(GetLastSegment(path)),
'visibleRatio', float(visibleElements) / max(1, numElements),
'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
'PPRERatio', float(numPPRE) / max(1, numElements),
'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),
'innerTextLength', len(innerText),
'textContentLength', len(textContent),
'innerHtmlLength', len(innerHTML),
'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)),
'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textContent)),
'innerTextWordCount', innerTextWords,
'textContentWordCount', textContentWords,
'innerhtmlWordCount', innerHTMLWords,
'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords),
'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textContentWords),
'textCount', numText,
'passwordCount', numPassword,
'formCount', numForms,
'anchorCount', numAnchors,
'elementCount', numElements,
'anchorRatio', float(numAnchors) / max(1, numElements),
]
for k in sorted(raw):
if 'mozScore' in k or 'num' in k:
features += [k, raw[k]]
mozScore
/**
 * Accumulates a text-length score over every `p` and `pre` element in the
 * document.
 *
 * An element contributes nothing when any of the following holds:
 *   - `isVisible(element)` is falsy;
 *   - its "className id" string matches `unlikelyCandidates` and does not
 *     match `okMaybeItsACandidate`;
 *   - `excludeLi` is truthy and the element matches the selector "li p";
 *   - its (optionally trimmed, saturated) text length is below `cut`.
 *
 * Otherwise it contributes `(min(saturate, length) - cut) ** power`.
 *
 * @param {boolean} trim - Trim the element's text before measuring it.
 * @param {number} power - Exponent applied to each element's contribution.
 * @param {number} cut - Minimum length; also subtracted from the length.
 * @param {boolean} excludeLi - Skip elements matching "li p".
 * @param {number} saturate - Upper bound applied to each measured length.
 * @returns {number} Sum of the contributions of all qualifying elements.
 */
function _mozScore(trim, power, cut, excludeLi, saturate) {
  const paragraphs = document.querySelectorAll('p,pre');
  let total = 0;

  for (let idx = 0; idx < paragraphs.length; idx++) {
    const el = paragraphs[idx];

    if (!isVisible(el)) {
      continue;
    }

    // Class name and id are checked together against the candidate filters.
    const classAndId = el.className + " " + el.id;
    const rejectedByName =
      unlikelyCandidates.test(classAndId) &&
      !okMaybeItsACandidate.test(classAndId);
    if (rejectedByName) {
      continue;
    }

    // `matches` is probed first so nodes lacking the method are kept.
    if (excludeLi && el.matches && el.matches("li p")) {
      continue;
    }

    const rawText = el.textContent;
    const measuredText = trim ? rawText.trim() : rawText;
    const cappedLength = Math.min(saturate, measuredText.length);

    if (cappedLength < cut) {
      continue;
    }

    total += Math.pow(cappedLength - cut, power);
  }

  return total;
}
分类算法
OG_ARTICLE
meta是否包括og:type
AdaBoost
原理:https://blog.csdn.net/v_JULY_v/article/details/40718799