# 代码
from bs4 import BeautifulSoup
# Scrape the article list out of a locally saved HTML page and print the
# title and rating of every article rated above 3.

# Location of the saved page; this path is specific to the author's machine.
HTML_PATH = 'C:/Users/Administrator/Desktop/Pycharmprojects/OReillyWebScraping/小白/html/1-2 web/new_index.html'

# Open in binary mode so BeautifulSoup detects the document's encoding itself
# instead of depending on the platform's default text encoding.
with open(HTML_PATH, 'rb') as web_data:
    soup = BeautifulSoup(web_data, 'lxml')

# One selector per field; each returns one element per <li> article entry.
titles = soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
images = soup.select('body > div.main-content > ul > li > img')
# The category selector stops at the parent <p>: one article can carry several
# category labels, so selecting the child <span>s directly would break the
# one-element-per-article alignment that zip() below relies on.
cates = soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')
descs = soup.select('body > div.main-content > ul > li > div.article-info > p.description')
rates = soup.select('body > div.main-content > ul > li > div.rate > span')

# Collect one dict per article so the records can be filtered afterwards.
# NOTE: zip() stops at the shortest list, so an article missing any one of
# these elements would shift the remaining fields out of alignment.
info = [
    {
        'title': title.get_text(),
        # stripped_strings yields every whitespace-stripped text fragment
        # under the tag; list() collects all category labels of the article.
        'cate': list(cate.stripped_strings),
        'desc': desc.get_text(),
        'rate': rate.get_text(),
        'image': image.get('src'),
    }
    for title, image, cate, desc, rate in zip(titles, images, cates, descs, rates)
]

# Report only the articles whose rating is greater than 3.
for article in info:
    if float(article['rate']) > 3:
        print(article['title'], article['rate'])
# Reference: raw CSS selectors for the first <li> entry. The selectors used
# in the code are these with `:nth-child(1)` dropped so they match every entry.
# Title       body > div.main-content > ul > li:nth-child(1) > div.article-info > h3 > a
# Image       body > div.main-content > ul > li:nth-child(1) > img
# Category    body > div.main-content > ul > li:nth-child(1) > div.article-info > p.meta-info > span:nth-child(2)
# Rating      body > div.main-content > ul > li:nth-child(1) > div.rate > span
# Description body > div.main-content > ul > li:nth-child(1) > div.article-info > p.description