获取商品网页的信息

有两个需要关注的点:

  1. BeautifulSoup 的 find_all() 方法
  2. 获取review的数量,有没有更简洁的方法?
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from bs4 import BeautifulSoup

# One dict per product scraped from the page, with the keys
# "image", "title", "price", "star" and "review".
goods_info = []

# Every field is selected beneath the same container path, so the shared
# selector prefix is written once and each field appends its own suffix.
_ITEM_SELECTOR = 'body > div > div > div.col-md-9 > div > div > div'

# BeautifulSoup consumes the whole file while building the tree, so the file
# only needs to stay open for the constructor call.
with open('/Users/kain/Documents/Course/Plan-for-combating-master/week1/1_2/1_2answer_of_homework/index.html', 'r') as web_data:
    soup_obj = BeautifulSoup(web_data, 'lxml')

images = soup_obj.select(_ITEM_SELECTOR + ' > img')
titles = soup_obj.select(_ITEM_SELECTOR + ' > div.caption > h4 > a')
prices = soup_obj.select(_ITEM_SELECTOR + ' > div.caption > h4.pull-right')
# Select the whole ratings <div> (not the individual <span>s) so that the
# filled stars can be counted per product below.
stars = soup_obj.select(_ITEM_SELECTOR + ' > div.ratings')
reviews = soup_obj.select(_ITEM_SELECTOR + ' > div.ratings > p.pull-right')

# NOTE: zip() stops at the shortest list, so the fields only line up when
# every product on the page has all five elements.
for image, title, price, star, review in zip(images, titles, prices, stars, reviews):
    goods_info.append({
        "image": image.get('src'),
        "title": title.get_text(),
        "price": price.get_text(),
        # As of Beautiful Soup 4.1.1 the class_ keyword searches by CSS class.
        # Passing the full "glyphicon glyphicon-star" string matches that exact
        # class attribute value, so only these spans are counted.
        "star": len(star.find_all("span", class_="glyphicon glyphicon-star")),
        "review": review.get_text(),
    })

# Print the title and price of every product rated above 3 stars that also
# has more than 30 reviews.
# NOTE(review): the original comment said "3 stars", but the comparison is
# strictly greater than 3 -- confirm which one is intended.
for each_goods in goods_info:
    # The review count is taken as the first whitespace-separated token of the
    # review text (assumes text shaped like "<count> reviews"). split() with
    # no argument tolerates leading/repeated whitespace, and int() accepts the
    # token directly, so no str() round-trip is needed.
    if each_goods['star'] > 3 and int(each_goods['review'].split()[0]) > 30:
        # A single pre-formatted argument prints identically under the
        # Python 2 print statement and the Python 3 print() function.
        print('%s %s' % (each_goods['title'], each_goods['price']))



最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容