# Notes
from bs4 import BeautifulSoup
import re
# Scrape product cards (title, image, price, star rating, review count) from a
# locally saved HTML page and print one dict per product.
html_file = '/Users/XXX/muggle/Plan-for-combating/week1/1_2/1_2answer_of_homework/index.html'

# Open the file with a context manager so the handle is always closed.
# First argument is the path; 'r' means read (use 'w' to write).
# An explicit encoding avoids garbled text when the platform default is not UTF-8.
with open(html_file, 'r', encoding='utf-8') as web_data:
    content = web_data.read()

soup = BeautifulSoup(content, 'lxml')

# Each selector yields a list of matching tags in document order; presumably
# one entry per product card -- verify against the page if the output looks off.
titles = soup.select("body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a")
images = soup.select("body > div > div > div.col-md-9 > div > div > div > img")
reviews = soup.select("body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right")
prices = soup.select("body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right")
stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

# NOTE: zip() stops at the shortest list, so a card missing one of these
# elements silently drops/misaligns trailing entries.
for title, image, price, star, review in zip(titles, images, prices, stars, reviews):
    # Use \d+ (one or more digits): \d* also matches the empty string at
    # position 0, which made int('') raise ValueError whenever the text did
    # not start with a digit (e.g. leading whitespace).
    review_match = re.search(r'\d+', review.get_text())
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'price': price.get_text(),
        # The rating is the number of star <span> tags inside the element.
        'star': len(star.find_all('span', class_="glyphicon glyphicon-star")),
        # First run of digits in the review text; None when the text has no number.
        'review': int(review_match.group()) if review_match else None,
    }
    print(data)