爬取B站视频
歌曲:暗号 - 周杰伦
Part0 懒人纯享版
- 下载地址见结尾
Part1 环境准备
- python3环境
- 安装lxml模块 # pip install lxml
- 安装requests模块 # pip install requests
- 安装ffmpeg软件 # 见下文
Part2 获取网页代码
- 通过requests模块,获取目标页面的网页源代码。input函数接收url,并以"?"分割,只保留"?"之前的部分(即去掉网址中的查询参数)。
self.url = input('请输入网址:').split("?", 1)[0]
self.headers = {
"user-agent": "Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/\
91.0.4472.77Safari/537.36Edg/91.0.864.37",
}
self.htmlData = requests.get(url=self.url, headers=self.headers)
- 通过xpath定位,提取网页的title作为视频的文件名,并替换掉部分不支持作为文件名的字符。(如:空格,斜杠,制表符等)
self.title = etree.HTML(self.htmlData.content).xpath('//*[@id="viewbox_report"]/h1/span/text()')[0]\
.replace(' ','_').replace('\t','_').replace('/','_')
- 获取页面xpath的方法
- 获取页面源代码
- 使用浏览器打开获取到的源代码
- F12检查指定元素
- 右键复制元素的xpath地址
Part3 获取视频地址
- 分析网页源代码,得知视频地址存放于json格式的数据中
- 提取数据,并打印json格式数据进行分析
import pprint
# self.htmlData就是Part2中获取到的网页源代码
jsonStr = self.htmlData.text.split('window.__playinfo__=')[-1].split('<')[0].split(';')[0]
jsons = json.loads(jsonStr)
pprint.pprint(jsons)
- 从json格式数据中,获得视频地址
data = jsons["data"]
dash = data['dash']
videos = dash['video']
audios = dash['audio']
mp3_path = audios[0]['baseUrl']
mp4_path = videos[0]['baseUrl']
Part4 下载视频
- 同样是通过requests模块,访问获得的视频地址,从而获取视频数据,只是需要在headers消息头中添加Range参数(指定本次请求的字节范围,分段请求成功时服务器返回状态码206)。
headers = {
'Range': 'bytes={}-{}'.format(min, max),
'Referer': 'https://www.bilibili.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) \
Chrome/75.0.3770.100 Safari/537.36'
}
resp = requests.get(url, headers=headers)
- 创建文件夹video
import os
if not os.path.exists('video'):
os.makedirs('video')
- 将视频数据追加写入文件,以title命名
with open('video/{}.mp4'.format(self.title), 'ab+') as f:
f.write(resp.content)
Part5 视频合成
- 需要提前安装ffmpeg软件。下载地址见结尾
- 解压文件后,需要添加环境变量
- 安装后,需要重启计算机
- 利用ffmpeg进行视频合成。(将前面获得的视频与音频文件合成为output文件)
subprocess.call('ffmpeg -i video/{}.mp4 -i video/{}.mp3 -c:v copy -c:a aac -strict \
experimental video/{}_output.mp4'\
.format(self.title,self.title,self.title), shell=True)
完整代码
import requests
import json
import subprocess
from lxml import etree
import os
# import pprint
# ffmpeg 需要自己安装
# testUrl:https://www.bilibili.com/video/BV19r4y167Tf?spm_id_from=333.1007.top_right_bar_window_view_later.content.click
class getFile_fromBiliBili():
    """Download the media streams of one Bilibili video page and merge them.

    Workflow (see ``run``): read the page URL from stdin, fetch the page,
    extract the stream URLs from the embedded ``window.__playinfo__`` JSON,
    download the streams in byte ranges into the ``video`` directory and
    finally merge video and audio with the external ``ffmpeg`` program.

    Attributes set while running:
        url, headers, htmlData, title -- set by ``getHtml``.
        mp3_path, mp4_path -- stream URLs set by ``getPath`` ('' when absent).
    """

    # Width of the byte window requested per HTTP range request.
    _RANGE_STEP = 200000

    # Characters replaced by '_' when the page title is used as a file name:
    # whitespace, both path separators and the characters Windows forbids.
    _TITLE_TABLE = str.maketrans(dict.fromkeys(' \t\r\n/\\:*?"<>|', '_'))

    @classmethod
    def _sanitize_title(cls, raw_title):
        """Return raw_title with characters unsafe in file names replaced by '_'."""
        return raw_title.translate(cls._TITLE_TABLE)

    def _media_path(self, extension):
        """Return the path of the downloaded stream file with the given extension."""
        return 'video/{}.{}'.format(self.title, extension)

    def _ffmpeg_command(self):
        """Return the ffmpeg argument list that merges the mp4 and mp3 files.

        An argument list (instead of one shell string) is used because the
        title originates from the web page and must never be parsed by a shell.
        """
        return [
            'ffmpeg',
            '-i', self._media_path('mp4'),
            '-i', self._media_path('mp3'),
            '-c:v', 'copy',
            '-c:a', 'aac',
            '-strict', 'experimental',
            'video/{}_output.mp4'.format(self.title),
        ]

    # Fetch the page source.
    def getHtml(self):
        """Prompt for the page URL, fetch the page and derive the file title.

        Everything after the first '?' of the entered URL is dropped.

        Raises:
            IndexError: if the title element is not found in the page.
        """
        self.url = input('请输入网址:').split("?", 1)[0]
        self.headers = {
            "user-agent": "Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/"
                          "91.0.4472.77Safari/537.36Edg/91.0.864.37",
        }
        self.htmlData = requests.get(url=self.url, headers=self.headers)
        raw_title = etree.HTML(self.htmlData.content).xpath(
            '//*[@id="viewbox_report"]/h1/span/text()')[0]
        self.title = self._sanitize_title(raw_title)

    # Split the page source to obtain the stream URLs.
    def getPath(self):
        """Extract the stream URLs from the page fetched by ``getHtml``.

        Sets ``self.mp4_path`` and ``self.mp3_path``. When the page offers
        separate ("dash") streams the first video and, if present, the first
        audio entry are used. Otherwise the ``durl`` entry whose ``order`` is 1
        is used as video and ``mp3_path`` stays ''.

        Raises:
            KeyError: if the data contains neither a usable ``dash`` video
                entry nor a ``durl`` list.
        """
        jsonStr = self.htmlData.text.split('window.__playinfo__=')[-1].split('<')[0].split(';')[0]
        jsons = json.loads(jsonStr)
        data = jsons["data"]
        mp3_path = ''
        mp4_path = ''
        try:
            # Video and audio are delivered as separate streams.
            dash = data['dash']
            mp4_path = dash['video'][0]['baseUrl']
        except (KeyError, IndexError, TypeError):
            # Video and audio are delivered as one muxed stream.
            mp4_path = ''
            for durl in data['durl']:
                if durl['order'] == 1:
                    mp4_path = durl['url']
        else:
            # The audio stream is optional: a missing/null/empty audio list
            # must not throw away the video URL found above.
            try:
                mp3_path = dash['audio'][0]['baseUrl']
            except (KeyError, IndexError, TypeError):
                mp3_path = ''
        self.mp3_path = mp3_path
        self.mp4_path = mp4_path

    def _download_range(self, start, end, url, extension):
        """Request bytes start..end of url and append them to the stream file.

        Returns:
            '' if url is empty, 0 after a 206 response was appended to the
            file, otherwise the HTTP status code of the failed request.
        """
        if url == '':
            return ''
        headers = {
            'Range': 'bytes={}-{}'.format(start, end),
            'Referer': 'https://www.bilibili.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/75.0.3770.100 Safari/537.36'
        }
        resp = requests.get(url, headers=headers)
        if resp.status_code != 206:
            return resp.status_code
        print(resp.status_code)
        with open(self._media_path(extension), 'ab') as f:
            f.write(resp.content)
        return 0

    def download_mp4(self, min, max, url):
        """Append bytes min..max of the video stream to ``video/<title>.mp4``.

        Returns '' for an empty url, 0 on success (HTTP 206), otherwise the
        HTTP status code that was received.
        """
        return self._download_range(min, max, url, 'mp4')

    def download_mp3(self, min, max, url):
        """Append bytes min..max of the audio stream to ``video/<title>.mp3``.

        Returns '' for an empty url (no audio stream), 0 on success
        (HTTP 206), otherwise the HTTP status code that was received.
        """
        return self._download_range(min, max, url, 'mp3')

    # Download module.
    def download_mp3_mp4(self, mp3_path, mp4_path):
        """Download both streams range by range into the ``video`` directory.

        Each stream is requested window after window until its download
        method returns something other than 0 (a non-206 status code, or ''
        when the stream has no URL).
        """
        os.makedirs('video', exist_ok=True)
        # Chunks are appended to the files, so leftovers from an earlier run
        # with the same title would end up in front of the new data.
        for extension in ('mp4', 'mp3'):
            stale_file = self._media_path(extension)
            if os.path.exists(stale_file):
                os.remove(stale_file)

        start = 0
        end = self._RANGE_STEP
        mp4_status_code = 0
        mp3_status_code = 0
        while True:
            # Compare with 0 explicitly: '' (no URL) is falsy as well but
            # means "finished", not "keep downloading".
            if mp4_status_code == 0:
                mp4_status_code = self.download_mp4(start, end, mp4_path)
                print("mp4_status_code:", mp4_status_code)
            if mp3_status_code == 0:
                mp3_status_code = self.download_mp3(start, end, mp3_path)
                print('mp3_status_code:', mp3_status_code)
            if mp4_status_code != 0 and mp3_status_code != 0:
                break
            start = end + 1
            end = end + self._RANGE_STEP

    # Merge module.
    def compose(self):
        """Merge the downloaded video and audio files into ``<title>_output.mp4``.

        Nothing is merged when no separate audio file exists. Success is
        reported only when ffmpeg exits with code 0; a non-zero exit code or
        a failure to start ffmpeg is reported as a failed merge.
        """
        if not os.path.exists(self._media_path('mp3')):
            print('没有单独的音频文件,无需合成')
            return
        try:
            exit_code = subprocess.call(self._ffmpeg_command())
        except OSError:
            # ffmpeg could not be started (e.g. it is not on PATH).
            print('合成失败!!!')
            return
        if exit_code == 0:
            print('合成完成!!!')
        else:
            print('合成失败!!!')

    def run(self):
        """Run the whole workflow: fetch page, find URLs, download, merge."""
        self.getHtml()
        self.getPath()
        self.download_mp3_mp4(self.mp3_path, self.mp4_path)
        self.compose()
if __name__ == '__main__':
    # Script entry point: build the downloader and run the full workflow
    # (prompt for URL, locate streams, download, merge).
    downloader = getFile_fromBiliBili()
    downloader.run()
参考链接
- bilibili视频爬虫:https://blog.csdn.net/qq_45695453/article/details/105757919
- ffmpeg的安装与使用:https://blog.csdn.net/qq_39516859/article/details/81843419
下载链接
PS:微信公众号"小明阿婆"回复:"B站爬虫",可获取素材文件