本次更新修正了根标签的对称性(开闭标签配对)判断,以及属性值为空的标签无法匹配等问题。
# -*- coding: UTF-8 -*-
# !/usr/bin/env python
'''
author: MRN6
blog: qq_21264377@blog.csdn.net
updated: Jul. 14th, 2020 Tues. 09:29PM
'''
import re
def symmetry(tag, submatches):
    """Cut every candidate down to its first balanced ``<tag ...>...</tag>`` span.

    "Minimal symmetry": starting at the first opening ``<tag`` in a
    candidate, opening and closing tags of the same name are counted until
    they balance; everything after the balancing closing tag is dropped.

    Args:
        tag: Tag name without angle brackets, e.g. ``'div'``.
        submatches: Iterable of candidates.  Each candidate is either a
            ``re.findall`` tuple whose first item is the HTML text, or the
            HTML text itself.  ``None`` and empty candidates are skipped.

    Returns:
        A list of strings, one per candidate that contains an opening tag.
        A candidate whose tags never balance is kept from its opening tag
        to its end (best effort).
    """
    # Note: see https://blog.csdn.net/iteye_13785/article/details/82638686
    if not submatches:
        return []

    escaped_tag = re.escape(tag)
    # The opening alternative requires whitespace, '/' or '>' right after the
    # name so that tag 'a' does not count '<abbr', nor 'p' count '<pre'.
    # re.I keeps the scan consistent with a case-insensitive search.
    tag_token = re.compile(
        r'(?P<close></' + escaped_tag + r'>)'
        r'|(?P<open><' + escaped_tag + r'(?=[\s/>]))',
        re.I,
    )

    matches = []
    for match in submatches:
        if not match:
            continue
        text = match if isinstance(match, str) else match[0]

        start = -1
        end = len(text)  # used when the tags never balance
        depth = 0
        for token in tag_token.finditer(text):
            if token.lastgroup == 'open':
                if start < 0:
                    start = token.start()
                depth += 1
            elif depth > 0:
                # Closing tags seen before the first opening tag are ignored.
                depth -= 1
                if depth == 0:
                    end = token.end()
                    break

        if start >= 0:
            matches.append(text[start:end])
    return matches
# Path rule definition
def qpath(path=None, html=None):
    """Select HTML fragments with a small ``//``-separated path rule.

    Every step of *path* has one of these forms::

        tag
        tag:attribute=value
        tag:attribute=value:END

    ``value`` is matched as a prefix of the attribute value.  Steps are
    applied left to right; each step searches inside the fragments found by
    the previous one, and the first step searches *html* itself.

    Matching is regex based (multi-line, dot-all, case-insensitive) and the
    closing tag has to sit at the end of a line:

    * A non-final ``tag`` / ``tag:attribute=value`` step matches from an
      opening tag up to the last closing tag that precedes the next opening
      tag of the same kind.
    * A non-final ``...:END`` step and the final step match an element that
      contains no closing tag of its own name inside it.
    * After the first step, candidates for ``div``, ``ul``, ``p`` and ``a``
      are trimmed to a balanced element by ``symmetry``.

    Args:
        path: The rule string, e.g. ``"//div:id=root//div:class=item"``.
            Blank segments (such as the one before a leading ``//``) are
            ignored.
        html: The document text to search.

    Returns:
        A list of ``re.findall`` tuples ``(fragment, last_inner_char)`` for
        the final step, or ``[]`` when *path* or *html* is ``None`` or
        nothing matches.

    Side effects:
        Prints the parsed tag / attribute pattern of every step.
    """
    if path is None or html is None:
        return []

    flags = re.M | re.S | re.I
    # Positions are counted over the non-blank steps only, so the path works
    # with or without a leading/trailing "//" and with a single step.
    steps = [rule for rule in path.split("//") if rule.strip()]
    last_position = len(steps) - 1
    matches = []
    parents = [html]

    for position, rule in enumerate(steps):
        ruledatas = rule.split(':')
        tag = ruledatas[0]
        attributecontent = ''
        attribute_pattern = ''
        if len(ruledatas) > 1:
            # partition() tolerates a missing '=' and keeps any further '='
            # inside the value.
            attribute, _, value = ruledatas[1].partition('=')
            attributecontent = attribute + '="' + value + '[^"]*"'
            # Escaped twin of attributecontent: this one goes into the regex,
            # so metacharacters in the rule are matched literally.
            attribute_pattern = (
                re.escape(attribute) + '="' + re.escape(value) + '[^"]*"'
            )
            print('<' + tag + ' ' + attributecontent)
        else:
            print(tag)
        print(attributecontent)

        # The lookahead stops '<li' from also matching '<link ...>'.
        opening = '<' + re.escape(tag) + r'(?=[\s/>])[^<>]*'
        if attribute_pattern:
            opening += attribute_pattern + '[^<>]*'
        opening += '>'
        closing = '</' + re.escape(tag) + '>'
        innermost = '(' + opening + '((?!' + closing + ').)*' + closing + '$)'
        up_to_next_opening = (
            '(' + opening + '((?!' + opening + ').)*' + closing + '$)'
        )

        if position == last_position:
            pattern = innermost
        elif len(ruledatas) <= 2:
            pattern = up_to_next_opening
        elif len(ruledatas) == 3 and ruledatas[2] == 'END':
            pattern = innermost
        else:
            pattern = None

        if pattern is None:
            # Unsupported suffix: as before, a first step yields nothing and
            # a later step leaves the current candidates untouched.
            found = [] if position == 0 else parents
        else:
            found = []
            for parent in parents:
                if not parent:
                    continue
                # Candidates are findall tuples, or plain strings once they
                # have been through symmetry().
                parent_html = parent if isinstance(parent, str) else parent[0]
                # Collect the hits of every parent, not just the last one.
                found.extend(re.findall(pattern, parent_html, flags))

        if position == last_position:
            matches.extend(found)
        elif position == 0 and tag in ['div', 'ul', 'p', 'a']:
            # Check tag symmetry: the first search may run past the real
            # closing tag.
            found = symmetry(tag, found)
        parents = found

    return matches
# Sample page 1: a <div id="root"> that holds three "content-item" blocks,
# each with a "content-title" and a "content-body" child. It is the input
# for the mypath / mypath2 rules.
html='''
<!DOCTYPE html>
<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge, chrome=1">
<title>标题</title>
</head>
<body>
<div id="root">
<div class="content-item first">
<div class="content-title">title1</div>
<div class="content-body">content1</div>
</div>
<div class="content-item">
<div class="content-title">title_2</div>
<div class="content-body">content2</div>
</div>
<div> </div>
<div class="content-item">
<div class="content-title">title3_</div>
<div class="content-body">content3</div>
</div>
</div>
<div> </div>
</body>
</html>
'''
# Sample page 2: a larger news-portal style page (hot-rank list, ad slots,
# finance and sports sections). It is the input for the mypath3 rule.
html2='''
<!DOCTYPE HTML>
<!--[if IE 6 ]> <html id="ne_wrap" class="ne_ua_ie6 ne_ua_ielte8"> <![endif]-->
<!--[if IE 7 ]> <html id="ne_wrap" class="ne_ua_ie7 ne_ua_ielte8"> <![endif]-->
<!--[if IE 8 ]> <html id="ne_wrap" class="ne_ua_ie8 ne_ua_ielte8"> <![endif]-->
<!--[if IE 9 ]> <html id="ne_wrap" class="ne_ua_ie9"> <![endif]-->
<!--[if (gte IE 10)|!(IE)]><!--> <html id="ne_wrap" phone="1"> <!--<![endif]-->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="model_url" content="http://news.163.com/special/index2015/" />
<title>网易新闻</title>
<base target="_blank" />
<meta name="keywords" content="新闻,新闻中心,新闻频道,时事报道" />
<meta name="description" content="新闻,新闻中心,包含有时政新闻,国内新闻,国际新闻,社会新闻,时事评论,新闻图片,新闻专题,新闻论坛,军事,历史,的专业时事报道门户网站" />
<link rel="stylesheet" href="https://static.ws.126.net/163/f2e/news/index2016_rmd/css/head~bbb97b2a2256c.css">
</head>
<body class="news_pc" ne-module="/news/index2016_rmd/index2016.js" ne-class="{{myState.isNs9 ? 'ns9' : 'ns12'}}" ne-plugin="/modules/plugins/lazyload/lazyload.js">
<!-- 热点排行 开始 -->
<div class="mt35 mod_hot_rank clearfix" ne-class="{{myState.isReadypage ? 'cm_area_show' : ''}}">
<div class="idx_cm_title">
<a href="http://news.163.com/rank/" class="title">热点排行</a>
</div>
<ul>
<li class=" top ">
<em>1</em>
<a href="https://news.163.com/20/0713/00/FHCI7FA30001899O.html">公交坠湖系司机蓄意报复社会 人民日报:罪不容赦</a>
<span>295922</span>
</li>
<li class=" top ">
<em>2</em>
<a href="https://news.163.com/20/0713/19/FHEKUNB400018AOR.html">贵州独山县回应纪录片“如何烧掉400亿”:正核实</a>
<span>113684</span>
</li>
<li class=" top ">
<em>3</em>
<a href="https://news.163.com/20/0713/12/FHDSN8OK0001899O.html">女孩吃饭疑遭男伴下药 店员发现后换走水杯保护她</a>
<span>94240</span>
</li>
<li class="">
<em>4</em>
<a href="https://news.163.com/20/0713/15/FHE4PHU70001899O.html">小学生研究结直肠癌获奖 中科院:其系研究员之子 </a>
<span>74419</span>
</li>
<li class="">
<em>5</em>
<a href="https://news.163.com/20/0713/14/FHE1CJQ00001899O.html">小学生"新基因对癌细胞影响"研究获全国大奖受质疑</a>
<span>74327</span>
</li>
<li class="">
<em>6</em>
<a href="https://news.163.com/20/0713/20/FHEO4QDU00018AOR.html">四川14岁女孩坠亡 生前疑遭一公司老板强奸致孕</a>
<span>65168</span>
</li>
<li class="">
<em>7</em>
<a href="https://news.163.com/20/0713/14/FHE3LJ5000018AOR.html">美军机再度逼近广东海岸125公里处?这次机型不多见</a>
<span>40234</span>
</li>
<li class="">
<em>8</em>
<a href="https://news.163.com/20/0713/01/FHCL470200018AOP.html">这一轮南方洪水屡屡"突破历史极值" 原因是什么?</a>
<span>29173</span>
</li>
<li class="">
<em>9</em>
<a href="https://news.163.com/20/0713/10/FHDKDE100001899O.html">鄱阳湖各水文站全线告急!其中4个现超98年洪水水位</a>
<span>25148</span>
</li>
<li class="">
<em>10</em>
<a href="http://v.163.com/paike/VFG0QCFVN/VEGEM1C01.html">武汉长江水位暴涨到了惊人的28.6m,三镇江滩全部关闭,看着吓人</a>
<span>24799</span>
</li>
</ul>
</div>
<!-- 热点排行 结束 -->
<div class="mt25 mod_ad_4 mod_ad_r">
<!-- 300*250 -->
<div class="at_item" >
<!-- 广告位:网易-新闻频道-首页-M4 -->
<div id="ssp_6905142"></div>
<script>
(function() {
(window.slotbydup=window.slotbydup || []).push({
id: '6905142',
container: 'ssp_6905142',
size: '300,250',
display: 'inlay-fix',
async: true
});
})();
</script>
</div>
</div>
<!-- 财经 开始 -->
<div class="mt35 mod_money" ne-class="{{myState.isReadypage ? 'cm_area_show' : ''}}">
<div class="idx_cm_title">
<h2 class="title"><a href="http://money.163.com/">财经</a></h2>
</div>
<div class="idx_cm_img">
<a href="https://money.163.com/20/0713/07/FHD9R54000258152.html">
<img ne-lazy="effect:fadeIn;slideIndex:0;" data-original="https://cms-bucket.ws.126.net/2020/0713/de47cd3aj00qddwui0005c0008c0046c.jpg?imageView&thumbnail=300y150" width="300" height="150" alt="券商:A股正经历"长牛" ">
<div class="bg">
<h3>券商:A股正经历"长牛" </h3>
</div>
</a>
</div>
<ul class="mt12 idx_cm_list idx_cm_list_h">
<li>
<a href="https://money.163.com/20/0713/07/FHDB4P0L00259DLP.html">马云在阿里持股降至5% 蒋凡被除名合伙人</a>
</li>
<li>
<a href="https://money.163.com/20/0713/07/FHDAS0PH00259DLP.html">烧光84亿!我所亲历的拜腾造车大溃败 </a>
</li>
<li>
<a href="https://money.163.com/20/0713/07/FHD9S8DP00259DLP.html">全国农村集体家底摸清:账面资产6.5万亿元</a>
</li>
<li>
<a href="https://money.163.com/20/0713/07/FHD9G27600259DLP.html">贝索斯前妻成美国女首富 身家4400亿!</a>
</li>
</ul>
</div>
<!-- 财经 结束 -->
<!-- 体育 开始 -->
<div class="mt27 mod_sports" ne-class="{{myState.isReadypage ? 'cm_area_show' : ''}}">
<div class="idx_cm_title">
<h2 class="title"><a href="http://sports.163.com/">体育</a></h2>
</div>
<div class="idx_cm_img">
<a href="https://sports.163.com/20/0713/22/FHETPD9800058781.html">
<img ne-lazy="effect:fadeIn;slideIndex:0;" data-original="https://cms-bucket.ws.126.net/2020/0713/68fb1c18j00qdewiv000hc0008c005kc.jpg?imageView&thumbnail=300y150" width="300" height="150" alt="无证驾驶!迷你罗驾摩托艇涉嫌违法">
<div class="bg">
<h3>无证驾驶!迷你罗驾摩托艇涉嫌违法</h3>
</div>
</a>
</div>
<ul class="mt12 idx_cm_list idx_cm_list_h">
<li>
<a href="http://sports.163.com/nba/">乔治:我爱LA 喜欢和LBJ一起打球</a>
</li>
<li>
<a href="http://sports.163.com/18/0613/03/DK5AC8750005877U.html">格林:3年前降薪就在等KD</a> <a target="_blank" href="http://sports.163.com/18/0613/06/DK5L3C180005877U.html">特制T恤嘲讽LBJ</a>
</li>
<li>
<a href="http://sports.163.com/18/0613/11/DK679LN20005877U.html">塔克4000双鞋让保罗羡慕嫉妒 乔丹被震惊</a>
</li>
<li>
<a href="http://sports.163.com/cba/">CBA下季新赛制:常规赛4组循环 增至46轮</a>
</li>
</ul>
</div>
<!-- 体育 结束 -->
<div class="mt27 mod_ad_5 mod_ad_r" ne-class="{{myState.isReadypage ? 'cm_area_show' : ''}}">
<!-- 300*250 -->
<div class="at_item" >
<!-- 广告位:网易-新闻频道-首页-M5 -->
<div id="ssp_6905143"></div>
</body>
</html>
'''
# Demo rules. mypath and mypath2 address the small sample page (html);
# mypath3 addresses the news page (html2).
mypath="//div:id=root//div:class=content-item"
mypath2="//div:id=root//div:class=content-item//div:class=content-title:END"
mypath3="//div:class=mt35 mod_hot_rank clearfix//ul//li:class= top "

# Disabled demo runs for mypath / mypath2, kept as an inert string literal.
'''
results=qpath(mypath, html)
print('result(s):'+str(len(results)))
counter=0
for result in results:
    counter=counter+1
    print(str(counter))
    print(result)
results2=qpath(mypath2, html)
print('result(s):'+str(len(results2)))
counter=0
for result in results2:
    counter=counter+1
    print(str(counter))
    print(result)
'''

# Active demo: print the number of hits for mypath3, then each hit preceded
# by its 1-based index.
results3=qpath(mypath3, html2)
print('result(s):'+str(len(results3)))
counter=0
for result in results3:
    counter=counter+1
    print(str(counter))
    print(result)
运行效果如下图:
- 输入规则path="//div:class=mt35 mod_hot_rank clearfix//ul//li:class= top "
- 输入规则path="//div:class=mt35 mod_hot_rank clearfix//ul//li"
- 输入规则path="//div:class=mt35 mod_hot_rank clearfix//ul//li:class="