HTML 解析

pyquery 使用

参考

from pyquery import PyQuery as pq
from lxml import etree
# Four ways to create a PyQuery object.
# Sample markup shared by the two in-memory examples below. The original
# string-based example opened <div> twice but closed it once; both examples
# now use the same balanced markup.
sample_html = (
    '<div> '
    '<tr class="item-0"> <td>first section</td> <td>1111</td> <td>17-01-28 22:51</td> </tr> '
    '<tr class="item-1"> <td>second section</td> <td>2222</td> <td>17-01-28 22:53</td> </tr> '
    '</div>'
)
doc1 = pq(etree.fromstring(sample_html))  # 1. wrap an already-parsed lxml element
doc2 = pq(sample_html)  # 2. build directly from an HTML string
doc3 = pq(filename='hello')  # 3. build from an HTML file
doc4 = pq(url='http://google.com')  # 4. build from a URL

# The selector examples below use `doc`, which was never defined; bind it to
# one of the documents created above so the snippet actually runs.
doc = doc1
doc('.class')  # select the elements with the given class
doc('#id')  # select the element with the given id
data = doc('tr')  # list-like PyQuery object holding every <tr> element
# items is a method: it must be called to iterate each <tr> as a PyQuery object.
for tr in doc('tr').items():
    # eq() is zero-based, so eq(2) picks the third <td>; text is also a method
    # and must be called to obtain the string.
    print(tr('td').eq(2).text())
doc('p').attr('id')  # read the id attribute of the <p> tag
doc('p').find('#n')  # search inside the <p> elements for the id "n"

beautifulsoup4

beautifulsoup4

同上

# Beautiful Soup practice: parse a small sample document and print parts of it.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the sample markup with the lxml backend.
soup = BeautifulSoup(html_doc, features='lxml')

# Print the document re-indented for readability.
pretty_html = soup.prettify()
print(pretty_html)

# Print the text inside the <title> tag.
title_text = soup.title.string
print(title_text)

# Print the href attribute of every <a> tag.
for link in soup.find_all(name='a'):
    href = link.get('href')
    print(href)

# Print all the text in the document with the tags stripped.
plain_text = soup.get_text()
print(plain_text)

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容