序
- 作为网易云音乐的忠实用户,当然需要搞搞它~
- 在爬取的过程中是有点难度的,需要花费一些精力的。
- 像是歌曲ID的url拼接、或是关键参数的加密破解获取等等~
正
- 获取这种歌单的歌曲和歌曲下面的所有评论~
1.下载歌曲
-
下载歌曲唯一需要注意的点就是对歌曲id进行真正的url拼接
获取此id就可以进行拼接了
- 原本的歌曲url
https://music.163.com/#/song?id=1407358755
- 其实真实的url是这个(将id获取下来)
http://music.163.com/song/media/outer/url?id=1407358755.mp3
由此我进行拼接
# Build the direct download URL: the text after the first '=' in the song
# href `u` is used as the song id and wrapped in the outer-URL endpoint.
musicUrl = 'http://music.163.com/song/media/outer/url?id=' + u.split('=')[1] + '.mp3'
- 注意:携带请求头请求
- 完整下载歌曲的代码
# -*- coding: utf-8 -*-
import requests,ssl
from lxml import etree
from copyheaders import headers_raw_to_dict
import urllib.request
# Swap the default HTTPS context factory for the unverified one, so HTTPS
# requests made through urllib skip certificate verification.
# NOTE(review): this is process-wide and unsafe outside a throwaway script.
ssl._create_default_https_context = ssl._create_unverified_context
# Raw request headers, one "name: value" pair per line.
# NOTE(review): the cookie looks like a captured browser session and has
# presumably expired -- replace it with a fresh one before running.
headers = '''
authority: music.163.com
method: GET
path: /
scheme: https
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: zh-CN,zh;q=0.9,en;q=0.8,zh-HK;q=0.7,zh-TW;q=0.6
cache-control: max-age=0
cookie: _iuqxldmzr_=32; _ntes_nnid=df5a2087ca2abb70b4bd3d6170b71dd8,1576462996114; _ntes_nuid=df5a2087ca2abb70b4bd3d6170b71dd8; WM_TID=FAq2Izmu0OhBQVVRVEdtrgGglTVpU2XD; WM_NI=Ixn8HGuhi6m6UnVf3aZbSOHiXTaPPkbaWn0cillgVZAecQsakDyUwcw%2B2U1SU2FcAMPShnFqcQcZ247MWWbO96%2B4DOE37wV%2BhJnSdG7mHVrvW4dQZSkLaSdnzu%2Blo8umOWg%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed8d85fa59996d8b56682928eb3c85e929b9abbb825b1a9ad83b2538cedbdadf92af0fea7c3b92a8ba8a2d9cb4db7b8a7d7f03e97b29a87b26b9abe9db0cb639c8cafb1d361a2bf998fb369abb4a488d55283bbadb2f26ba99cb7b6f669b5f5feb6b45997ee8a84c447948da88fc57ba18fadbbb140f59399b8e145fbeae5d5c74587edf998d521ac96a2b4e74df5b388b7f17e95ecacb2d959a994afb4e766baabe58dee34b7b381b8e637e2a3; JSESSIONID-WYYY=Dtz2qrqubvoAeyIW1F1NCF7uF2yk3wSP%2B8Qo3jyBoIk6%2BPjP2JYn970Cc2Sf1%2Bp3%5CfFEvaoueYJmHAtnQobcm8k4hWagBqBQ4ThVZQwdennHsWMke%2Fk6vU28EqQk5uvjUvaAPCcg%2Bi7n3N4CI33dtB7cNKJ%2FkbhW9uCneafzS6i7qOfd%3A1579005850713
referer: https://music.163.com/
sec-fetch-mode: navigate
sec-fetch-site: same-origin
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'''
# headers_raw_to_dict is handed bytes, so encode the raw text first.
h = bytes(headers, encoding="utf-8")
# Rebind `headers` to the parsed form; it is what download_parse passes to
# requests.get(headers=...). Presumably a dict, going by the helper's name.
headers = headers_raw_to_dict(h)
# Filled by download_parse with [song title, mp3 url] pairs.
musicUrl_list = []
def download_parse():
    """Download every song of the hard-coded playlist into ``./song``.

    Fetches the playlist page, reads each song's href and title from the
    hidden ``ul.f-hide`` list, appends ``[title, mp3_url]`` pairs to the
    module-level ``musicUrl_list`` and then downloads every entry of that
    list to ``./song/<title>.mp3``. A failed download is printed and
    skipped so the remaining songs are still attempted.

    Returns:
        The list of song hrefs scraped from the playlist page.
    """
    # Imported locally: the file's import block provides neither module.
    import os
    import re

    url = 'https://music.163.com/playlist?id=2008272804'
    datas = requests.get(url, headers=headers)
    sku_html = etree.HTML(datas.text)
    # Song hrefs (each carries the song id after '=').
    result_data = sku_html.xpath('//ul[@class="f-hide"]/li/a/@href')
    # Song titles; assumed to line up one-to-one with the hrefs above.
    result_data_name = sku_html.xpath('//ul[@class="f-hide"]/li/a/text()')
    print(len(result_data))
    for name, u in zip(result_data_name, result_data):
        if '=' not in u:
            # No id in this href, so no download URL can be built from it.
            continue
        # Build the direct mp3 URL from the song id.
        musicUrl = 'http://music.163.com/song/media/outer/url?id=' + u.split('=')[1] + '.mp3'
        musicUrl_list.append([name, musicUrl])

    # urlretrieve does not create missing directories.
    os.makedirs('./song', exist_ok=True)
    # Download everything collected so far, naming each file after its title.
    for name, url in musicUrl_list:
        # Titles can contain path separators or characters that are invalid
        # in file names on some platforms; replace them.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        try:
            print('正在下载', name)
            urllib.request.urlretrieve(url, './song/%s.mp3' % safe_name)
            print('下载成功')
        except Exception as e:
            # Best effort: report the failure and move on to the next song.
            print(e)
            print('下载失败')
    return result_data
2.获取歌曲下的评论
-
难度就在评论的采集获取,在获取时需要携带两个参数(params和encSecKey)才能获得评论信息。
- 通过携带这两个参数对网页进行post请求,返回json字符串,再对其进行解码
# POST the form fields in `dataf` to the comment endpoint, then parse the
# response body as JSON.
sku_data = requests.post(url=self.url,data=dataf,headers=self.headers)
json_dict = json.loads(sku_data.text)
- 不过我们暂时只能获取到第一页的热门评论与最新评论,因此我们需要对这两个参数进行解析与破解,看到这里的js加密我很懵逼
- 由此,我在知乎找到一位大佬做了很详细的分析=》链接
简单来说,就是找到关键js文件(js加密的‘发起者’),在其中寻找参数相关的代码,利用Fiddler进行调试,并且分别对四种参数进行分析比对,两次aes加密,从而得出params的结果,encSecKey是常量,直接使用即可。
- 相关代码
# Plaintext request payload for the first page (offset 0, 20 comments per page).
first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
# second_param / third_param are not used by the code shown here.
# NOTE(review): presumably the RSA public exponent and modulus the site's JS
# uses to derive encSecKey -- confirm before relying on this.
second_param = "010001"
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Key of the first AES pass in get_params.
forth_param = "0CoJUm6Qyw8W8jud"
def get_params(self, i):
    """Build the encrypted ``params`` form field for comment page ``i``.

    The plaintext payload is encrypted twice with ``self.AES_encrypt``:
    first with ``self.forth_param`` as key, then with a fixed key of
    sixteen ``'F'`` characters.

    Args:
        i: Zero-based page index; the offset sent is ``i * 20``.

    Returns:
        The result of the second ``self.AES_encrypt`` call.
    """
    if i == 0:
        first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        offset = str(i * 20)
        # The flag used to be misspelled 'flase'.
        first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
    iv = "0102030405060708"
    first_key = self.forth_param
    second_key = 16 * 'F'
    # Two AES passes; the first result is decoded so it can be padded and
    # encrypted again as text.
    h_encText = self.AES_encrypt(first_param, first_key, iv).decode('utf-8')
    h_encText = self.AES_encrypt(h_encText, second_key, iv)
    return h_encText
def get_encSecKey(self):
    """Return the hard-coded ``encSecKey`` string."""
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
def AES_encrypt(self, text, key, iv):
    """Encrypt ``text`` with AES-CBC and return the ciphertext as Base64.

    The plaintext is padded to a multiple of 16 bytes; each padding byte
    holds the padding length.

    Args:
        text: Plaintext, ``str`` or ``bytes``.
        key: AES key, ``str`` or ``bytes``.
        iv: Initialisation vector, ``str`` or ``bytes``.

    Returns:
        bytes: Base64 encoding of the ciphertext.
    """
    def to_bytes(value):
        # The cipher API only accepts bytes-like input on Python 3.
        return value.encode('utf-8') if isinstance(value, str) else value

    data = to_bytes(text)
    # Pad on the encoded length: non-ASCII text has more bytes than characters.
    pad = 16 - len(data) % 16
    data = data + bytes([pad]) * pad
    encryptor = AES.new(to_bytes(key), AES.MODE_CBC, to_bytes(iv))
    return base64.b64encode(encryptor.encrypt(data))
- 获取评论数以及页数
def get_page(self, params, encSecKey):
    """Request one comment page and derive the number of pages from it.

    Args:
        params: Encrypted ``params`` form value.
        encSecKey: ``encSecKey`` form value.

    Returns:
        int: Number of 20-comment pages needed for the ``total`` reported
        in the response JSON.
    """
    json_text = self.get_json(params, encSecKey)
    json_dict = json.loads(json_text.text)
    total_comment = json_dict['total']
    # Ceiling division: `total / 20 + 1` produced a float and one page too
    # many whenever the total was an exact multiple of 20.
    page = (total_comment + 19) // 20
    print('***查询到评论共计%d条,%d页***' % (total_comment, page))
    return page
- 完善代码,循环获取每页评论
如果想要热门评论的话,把comments改成hotComments就好了~
# -*- coding: utf-8 -*-
import requests,json
from copyheaders import headers_raw_to_dict
from Crypto.Cipher import AES
import base64
# 网易云音乐
class Huangye():
    """Fetch and print the comments of one NetEase Cloud Music song.

    The song is fixed by ``self.url``; the digits after ``R_SO_4_`` are the
    song id. Every request posts two form fields, ``params`` (built by
    ``get_params``) and ``encSecKey`` (a constant).
    """

    def __init__(self):
        self.url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_1399533630?csrf_token='
        # Raw header text, converted below into the form requests expects.
        self.h = '''
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36
'''
        h = bytes(self.h, encoding="utf-8")
        self.headers = headers_raw_to_dict(h)
        # Plaintext payload for the first page.
        self.first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
        # second_param / third_param are not used by this class.
        # NOTE(review): presumably the RSA exponent/modulus behind encSecKey.
        self.second_param = "010001"
        self.third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
        # Key of the first AES pass in get_params.
        self.forth_param = "0CoJUm6Qyw8W8jud"

    def get_params(self, i):
        """Build the encrypted ``params`` form field for comment page ``i``.

        Args:
            i: Zero-based page index; the offset sent is ``i * 20``.

        Returns:
            The result of the second ``AES_encrypt`` call (Base64 bytes).
        """
        if i == 0:
            first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
        else:
            offset = str(i * 20)
            # The flag used to be misspelled 'flase'.
            first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
        iv = "0102030405060708"
        first_key = self.forth_param
        second_key = 16 * 'F'
        # Two AES passes; the first result is decoded so it can be padded
        # and encrypted again as text.
        h_encText = self.AES_encrypt(first_param, first_key, iv).decode('utf-8')
        h_encText = self.AES_encrypt(h_encText, second_key, iv)
        return h_encText

    def get_encSecKey(self):
        """Return the hard-coded ``encSecKey`` form value."""
        encSecKey = "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
        return encSecKey

    def AES_encrypt(self, text, key, iv):
        """Encrypt ``text`` with AES-CBC and return the ciphertext as Base64.

        The plaintext is padded to a multiple of 16 bytes; each padding
        byte holds the padding length.

        Args:
            text: Plaintext, ``str`` or ``bytes``.
            key: AES key, ``str`` or ``bytes``.
            iv: Initialisation vector, ``str`` or ``bytes``.

        Returns:
            bytes: Base64 encoding of the ciphertext.
        """
        def to_bytes(value):
            # The cipher API only accepts bytes-like input on Python 3.
            return value.encode('utf-8') if isinstance(value, str) else value

        data = to_bytes(text)
        # Pad on the encoded length: non-ASCII text has more bytes than
        # characters.
        pad = 16 - len(data) % 16
        data = data + bytes([pad]) * pad
        encryptor = AES.new(to_bytes(key), AES.MODE_CBC, to_bytes(iv))
        return base64.b64encode(encryptor.encrypt(data))

    def get_page(self, params, encSecKey):
        """Request one comment page and derive the number of pages from it.

        Returns:
            int: Number of 20-comment pages needed for the ``total``
            reported in the response JSON.
        """
        json_text = self.get_json(params, encSecKey)
        json_dict = json.loads(json_text.text)
        total_comment = json_dict['total']
        # Ceiling division: `total / 20 + 1` produced a float and one page
        # too many whenever the total was an exact multiple of 20.
        page = (total_comment + 19) // 20
        print('***查询到评论共计%d条,%d页***' % (total_comment, page))
        return page

    def get_json(self, params, encSecKey):
        """POST the two form fields to ``self.url`` and return the response."""
        dataf = {
            "params": params,
            "encSecKey": encSecKey
        }
        sku_data = requests.post(url=self.url, data=dataf, headers=self.headers)
        return sku_data

    def _print_comments(self, comments):
        """Print nickname, content and like count of each comment."""
        for user in comments:
            # User nickname
            user_name = user['user']['nickname']
            print('用户名称:', user_name)
            # Comment text, with line breaks escaped so it stays on one line
            user_text = user['content'].replace('\r', '\\r').replace('\n', '\\n')
            print('评论内容:', user_text)
            # Number of likes on this comment
            user_num = user['likedCount']
            print('该评论被点赞数:', user_num)
            print('-' * 30)

    def hot_parse(self, json_data):
        """Print the hot comments of a response, followed by a separator.

        A response without a ``hotComments`` key prints only the separator.
        """
        self._print_comments(json_data.get('hotComments', []))
        print('=' * 100)

    def parse(self, json_data):
        """Print the regular comments (``comments`` key) of a response."""
        self._print_comments(json_data['comments'])

    def main(self):
        """Print the hot comments, then every page of regular comments."""
        params = self.get_params(0)
        encSecKey = self.get_encSecKey()
        page = self.get_page(params, encSecKey)
        json_text = self.get_json(params, encSecKey)
        json_dict = json.loads(json_text.text)
        print('热门评论==============================================================')
        self.hot_parse(json_dict)
        print('全部评论==============================================================')
        for i in range(int(page)):
            if i > 0:
                # Page 0 was already fetched above; only later pages need a
                # new request.
                params = self.get_params(i)
                json_text = self.get_json(params, encSecKey)
                json_dict = json.loads(json_text.text)
            self.parse(json_dict)
            print('下一页。。。。。。。。。')
        print('=' * 100)
if __name__ == '__main__':
    # Script entry point: build the crawler and print the song's comments.
    crawler = Huangye()
    crawler.main()
末
- 最后呢,可根据自己想法进行储存评论数据~
- 注意:大量采集需要使用代理ip,要不然会被封哟~
- 所以说,下载歌曲的同时,将歌曲id传过来,也是可以同时抓取评论的
- 说明一点,url的R_SO_4_后面一段数字就是歌曲id