import requests
from bs4 import BeautifulSoup
import bs4
import pprint
def get_html(url, headers):
    """Fetch *url* and return the response body as utf-8 text.

    Returns None (after printing an error) when the request fails or the
    server answers with a non-2xx status, so callers can skip bad pages.
    """
    try:
        # timeout prevents the script from hanging forever on a dead host
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as e:
        # narrow except: only network/HTTP errors, not KeyboardInterrupt etc.
        print("ERROR", e)
        return None
def fill_list(html, ulist):
    """Parse one board page and append one row per movie to *ulist*.

    Each row is [rank, name, stars, release_time, score]; fields are taken
    from the <i> and <p> children of every <dd> entry on the page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for dd in soup('dd'):
        if isinstance(dd, bs4.element.Tag):
            # hoist the child lookups: the original re-ran find_all()
            # for every single field
            ps = dd.find_all('p')
            istats = dd.find_all('i')
            rank = istats[0].string
            name = ps[0].string
            # collapse internal whitespace/newlines in the starring line
            stat = "".join(ps[1].string.split())
            releasetime = ps[2].string
            # score is split into integer and fraction parts on the page
            score = istats[1].string + istats[2].string
            ulist.append([rank, name, stat, releasetime, score])
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'
                      '/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari'
                      '/537.36'
    }
    base_url = 'http://maoyan.com/board/4'   # fixed typo: was "bash_url"
    deep = 10                                # 10 pages x 10 movies per page
    ulist = []
    for page in range(deep):
        url = base_url + '?offset=' + str(page * 10)
        html = get_html(url, headers)
        if html:  # get_html returns None on failure; skip that page
            fill_list(html, ulist)
    # pprint only for readable console output; rows could go straight
    # into a database instead
    pprint.pprint(ulist)
# Crawl results (sample output below):