8586231_192932724000_2.jpg
- find_all方法返回的是BeautifulSoup特有的结果集(bs4.element.ResultSet),按标签名或属性查找时里面装的是标签对象,按文本内容查找时里面装的是文本节点(字符串)对象
from bs4 import BeautifulSoup
import re
# Sample document shared by all examples below. The markup is intentionally
# left as-is: <body> and <html> are never closed inside the string.
html = """
<html><head><title>The Dormouse's story</title><title>The Dormouse's story2</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse the HTML held in the string above, using the 'lxml' parser.
soup=BeautifulSoup(html,'lxml')
①
# Look up every <a> tag by name and show the type of the returned container.
data = soup.find_all(name='a')
print(type(data))
结果:
<class 'bs4.element.ResultSet'>
取值方法:
# Walk the result set and print the text of each <a> tag.
# NOTE: the first link contains only an HTML comment (<!-- Elsie -->), so its
# .string is a bs4 Comment node rather than ordinary text.
data = soup.find_all('a')
for link in data:
    print(link.string)
结果:
Elsie
Lacie
Tillie
②
# Find tags by regular expression: the pattern is matched against tag names,
# so '^b' selects every tag whose name starts with "b".
data1 = soup.find_all(re.compile('^b'))
for tag in data1:
    # Print the current tag only; printing data1 here would dump the whole
    # result set once per matched tag.
    print(tag)
返回结果为所有名称以b开头的标签(本例中为<body>和<b>)
③
# Find tags by attribute: the keyword argument id='link2' is matched against
# each tag's id attribute.
data2 = soup.find_all(id='link2')
# Print the result set once; the loop variable was never used, so looping
# only repeated the same output for every match.
print(data2)
结果:
[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
④
# Find text nodes by their content; the results hold strings, not tags.
# `string=` is the current name of this filter (Beautiful Soup 4.4.0+);
# `text=` is the older spelling of the same argument.
data3 = soup.find_all(string='Tillie')  # exact match on one string
data4 = soup.find_all(string=['Lacie', 'Tillie'])  # match any string in the list
data5 = soup.find_all(string=re.compile("Do"))  # match strings containing "Do"
# Print all three results so the output lines up with the listing that follows.
print(data3)
print(data4)
print(data5)
结果(依次为data3、data4、data5的打印输出):
['Tillie']
['Lacie', 'Tillie']
["The Dormouse's story", "The Dormouse's story2", "The Dormouse's story"]