爬取目标:B站各类视频30日排行
代码
import requests
from lxml import etree
import xlwt
import time
# HTTP headers sent with every request; only a browser-like User-Agent is set.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/71.0.3578.98 Safari/537.36'
    ),
}

# One entry per ranking category: 'type' is used as the worksheet name and
# 'code' is substituted into the ranking URL by the main script.
params = [
    {'type': label, 'code': code}
    for label, code in (
        ('全部', '0'),
        ('动画', '1'),
        ('国创相关', '168'),
        ('音乐', '3'),
        ('舞蹈', '129'),
        ('游戏', '4'),
        ('科技', '36'),
        ('数码', '188'),
        ('生活', '160'),
        ('鬼畜', '119'),
        ('时尚', '155'),
        ('娱乐', '5'),
        ('影视', '181'),
    )
]

# Shared buffer: get_info() appends scraped rows here and the main script
# drains it after writing each sheet.
all_info_list = list()
def _first_text(node, path):
    """Return the first result of evaluating XPath *path* on *node*, or '' if none."""
    found = node.xpath(path)
    return found[0] if found else ''


def get_info(url, type, timeout=10):
    """Fetch one ranking page and append its rows to ``all_info_list``.

    Each ``<li>`` under ``<ul class="rank-list">`` yields one row of the form
    ``[rank, name, players, comments, author, score]``, matching the column
    order of the header row written by the main script.

    Args:
        url: Address of the ranking page to download.
        type: Category label. Not used by this function; kept so existing
            callers that pass it keep working.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (instead of silently parsing an error page).
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()

    html = etree.HTML(res.text)
    if html is None:
        # NOTE(review): some lxml versions appear to return None for an
        # unparseable/empty body - confirm; either way there is nothing to add.
        return

    for info in html.xpath('//ul[@class="rank-list"]/li'):
        # A missing node becomes '' rather than raising IndexError, so one
        # malformed list item does not abort the whole scrape.
        all_info_list.append([
            _first_text(info, 'div[1]/text()'),                         # rank
            _first_text(info, 'div[2]/div[2]/a/text()'),                # name
            _first_text(info, 'div[2]/div[2]/div[1]/span[1]/text()'),   # players
            _first_text(info, 'div[2]/div[2]/div[1]/span[2]/text()'),   # comments
            _first_text(info, 'div[2]/div[2]/div[1]/a/span/text()'),    # author
            _first_text(info, 'div[2]/div[2]/div[2]/div/text()'),       # score
        ])
if __name__ == '__main__':
    # Script entry point: write one worksheet per category in `params`, each
    # holding a header row followed by the rows get_info() collected.

    # The column titles are the same for every sheet, so build them once.
    header = ['排名', '视频', '播放量', '弹幕量', '作者', '综合得分']

    book = xlwt.Workbook(encoding='utf-8')
    for param in params:
        sheet = book.add_sheet(param['type'])
        for col, title in enumerate(header):
            sheet.write(0, col, title)

        url = 'https://www.bilibili.com/ranking/all/{}/0/30'.format(param['code'])
        get_info(url, param['type'])  # fills the shared all_info_list

        # Data rows start at row 1, directly below the header row.
        for row_idx, row in enumerate(all_info_list, start=1):
            for col_idx, value in enumerate(row):
                sheet.write(row_idx, col_idx, value)

        # Empty the shared buffer in place so the next category starts clean.
        all_info_list.clear()
        # Pause between categories to avoid hitting the site in rapid succession.
        time.sleep(2)

    book.save('C:/Users/user/Desktop/B站30日排行.xls')
结果
其他省略...