from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, 'lxml')
namelist = bsObj.findAll('span', {'class': 'green'})  # get all tags on the page that match the given name and attributes
for name in namelist:
    print(name.get_text())
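BeautifulSoup also accepts keyword arguments for attribute filters; because class is a reserved word in Python, the keyword form is class_ with a trailing underscore. A minimal sketch of the same selection written that way (same URL and class name as above):
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, 'lxml')
# class_ is BeautifulSoup's keyword equivalent of the {'class': 'green'} dictionary above
for name in bsObj.find_all('span', class_='green'):
    print(name.get_text())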
2. Handling child tags
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, 'lxml')
for child in bsObj.find('table', {'id': 'giftList'}).children:
    print(child)
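Note that .children yields only the direct children of the table (its rows and whitespace text nodes), while .descendants walks the entire subtree. A hedged sketch comparing the two on the same giftList table:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, 'lxml')
table = bsObj.find('table', {'id': 'giftList'})
# .children stops at the first level; .descendants recurses into every nested tag and string
print(len(list(table.children)))     # direct children only
print(len(list(table.descendants)))  # every tag and text node inside the table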
3. Handling sibling tags
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, 'lxml')
for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)
# previous_siblings: all siblings that come before the tag
# next_siblings: all siblings that come after the tag
# previous_sibling: only the single sibling immediately before the tag
# next_sibling: only the single sibling immediately after the tag
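To illustrate the difference between the plural and singular forms, the sketch below starts from the header row of the same giftList table; keep in mind that the singular attributes can return a whitespace text node rather than a tag, depending on the page's markup:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, 'lxml')
headerRow = bsObj.find("table", {"id": "giftList"}).tr
# plural forms are generators over all siblings; singular forms return one node
print(len(list(headerRow.next_siblings)))  # everything after the header row
print(repr(headerRow.next_sibling))        # just the node immediately after it
print(repr(headerRow.previous_sibling))    # just the node immediately before it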
4. Matching attributes with regular expressions
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, 'lxml')
images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
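The regex filter combines naturally with the navigation attributes above. The sketch below walks from one matched image up to its parent <td> and over to the neighbouring cell; whether previous_sibling lands on the price cell or on a whitespace text node depends on the exact markup of page3.html, so treat that as an assumption:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = BeautifulSoup(html, 'lxml')
img = bsObj.find("img", {"src": re.compile(r"\.\./img/gifts/img1\.jpg")})
# .parent climbs from the <img> to its enclosing <td>;
# .previous_sibling then moves to the cell before it (assumed here to hold the price)
print(img.parent.previous_sibling.get_text())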