Scraping the Maoyan TOP100 with requests + BeautifulSoup

import requests
from bs4 import BeautifulSoup
import bs4
import pprint


def get_html(url, headers):
    # Fetch a page and return its text; return None if the request fails.
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        print("ERROR:", e)


def fill_list(html, ulist):
    # Parse one board page and append one [rank, title, stars, release time, score] row per movie.
    soup = BeautifulSoup(html, 'html.parser')
    for dd in soup('dd'):  # each <dd> element holds one movie entry
        if isinstance(dd, bs4.element.Tag):
            rank = dd.find('i').string                           # ranking number
            name = dd.find_all('p')[0].string                    # movie title
            stat = "".join(dd.find_all('p')[1].string.split())   # starring line, whitespace stripped
            releasetime = dd.find_all('p')[2].string             # release time
            score = dd.find_all('i')[1].string + dd.find_all('i')[2].string  # integer + fraction parts of the score
            ulist.append([rank, name, stat, releasetime, score])


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'
                      '/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari'
                      '/537.36'
    }
    base_url = 'http://maoyan.com/board/4'
    depth = 10  # the TOP100 board spans 10 pages of 10 movies each
    ulist = []
    for j in range(depth):
        url = base_url + '?offset=' + str(j * 10)
        html = get_html(url, headers)
        fill_list(html, ulist)
    pprint.pprint(ulist)  # pprint is only used here for prettier output; the list could just as well be written straight to a database
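The parsing logic can be sanity-checked offline before hitting the live site. The snippet below is a minimal sketch: the <dd> markup is hand-written to match the selectors used in fill_list (rank in the first <i>, title/stars/release time in three <p> tags, score split across two more <i> tags); it is only an assumption about the board's structure, not a copy of the real page.

sample = '''
<dl>
  <dd>
    <i class="board-index">1</i>
    <p class="name">霸王别姬</p>
    <p class="star">  主演:张国荣,张丰毅,巩俐  </p>
    <p class="releasetime">上映时间:1993-01-01</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
  </dd>
</dl>
'''
rows = []
fill_list(sample, rows)  # parses only the hand-written snippet above
print(rows)  # [['1', '霸王别姬', '主演:张国荣,张丰毅,巩俐', '上映时间:1993-01-01', '9.6']]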

Scraping result: pprint outputs 100 entries, one per movie, each of the form [rank, title, stars, release time, score].

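As the final comment notes, the collected rows could be written to storage instead of just printed. Here is a minimal sketch using the standard csv module (the filename maoyan_top100.csv is just an example), run after the loop has filled ulist:

import csv

with open('maoyan_top100.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['rank', 'title', 'stars', 'release time', 'score'])  # header row
    writer.writerows(ulist)                                               # one row per movie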
