import csv
import json
import re

import requests
from lxml import etree
# HTTP request headers shared by every request this script sends.
headers = {
    # Browser-style User-Agent string, split across lines for readability;
    # adjacent literals are concatenated into the single original value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3719.400 QQBrowser/10.5.3715.400'
    ),
    # NOTE(review): the cookie value looks like a masked placeholder --
    # confirm whether a real kg_mid value must be supplied before running.
    'cookie': 'kg_mid=***',
}
# Play-data endpoint; the two placeholders receive the song hash and the
# album id scraped from the song page. The response is JSONP (a JSON object
# wrapped in the named callback).
_PLAY_DATA_URL = (
    'https://wwwapi.kugou.com/yy/index.php?r=play/getdata'
    '&callback=jQuery19106328788476737324_1563785427610'
    '&hash={}&album_id={}'
)

# Seconds to wait on any single HTTP request before giving up, so that a
# stalled connection cannot hang the whole scrape.
_REQUEST_TIMEOUT = 10


def _split_title(title):
    """Split a chart title of the form ``singer - song`` into (singer, song)."""
    # Prefer the spaced separator so hyphens inside a name (e.g. "A-Lin")
    # are not mistaken for the singer/song boundary.
    separator = ' - ' if ' - ' in title else '-'
    singer, _, song = title.partition(separator)
    return singer.strip(), song.strip()


def _extract_song_ids(page_text):
    """Return ``(song_hash, album_id)`` found in a song page, or ``None``."""
    for chunk in page_text.split('\r'):
        if 'dataFromSmarty' not in chunk:
            continue
        hash_match = re.search(r'"hash":"(.*?)",', chunk, re.S)
        album_match = re.search(r'"album_id":(.*?)}', chunk, re.S)
        if hash_match and album_match:
            # Only the first usable chunk is taken, so each song yields at
            # most one CSV row.
            return hash_match.group(1), album_match.group(1)
    return None


def _fetch_play_url(song_hash, album_id):
    """Return the ``play_url`` reported by the play-data endpoint, or ``None``."""
    response = requests.get(
        _PLAY_DATA_URL.format(song_hash, album_id),
        headers=headers,
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    # Strip the JSONP wrapper: keep everything from the first "{" to the
    # last "}" of the body.
    payload = re.search(r'\{.*\}', response.text, re.S)
    if payload is None:
        return None
    try:
        parsed = json.loads(payload.group(0))
    except ValueError:
        return None
    data = parsed.get('data') if isinstance(parsed, dict) else None
    if not isinstance(data, dict):
        return None
    return data.get('play_url')


def get_info(url, writer):
    """Scrape one chart page and write one CSV row per song.

    For every list item of the chart page this reads the rank, the
    "singer - song" title and the duration, then follows the song link to
    obtain the hash and album id needed to ask the play-data endpoint for
    the song's play URL. Each resolved song is printed and written as
    ``[rank, singer, song, time, play_url]``.

    Songs whose page or play data cannot be fetched or parsed are skipped
    with a message on stdout instead of aborting the whole page.

    Args:
        url: Address of the chart page to scrape.
        writer: Object with a ``writerow(list)`` method, e.g. a
            ``csv.writer``.

    Raises:
        requests.RequestException: If the chart page itself cannot be
            fetched (network error, timeout, or HTTP error status).
        IndexError: If a list item lacks one of the expected elements
            (rank, title, duration or link).
    """
    chart_page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    chart_page.raise_for_status()
    tree = etree.HTML(chart_page.text)
    entries = tree.xpath(
        '//div[@class="pc_temp_songlist pc_rank_songlist_short"]/ul/li'
    )

    for entry in entries:
        # string(.) flattens the rank cell, which may contain nested markup.
        rank = entry.xpath('span[3]')[0].xpath('string(.)').strip()
        singer, song = _split_title(entry.xpath('a/text()')[0])
        duration = entry.xpath('span[5]/span/text()')[0].strip()
        song_page_url = entry.xpath('a/@href')[0]

        try:
            song_page = requests.get(
                song_page_url, headers=headers, timeout=_REQUEST_TIMEOUT
            )
            song_page.raise_for_status()
            ids = _extract_song_ids(song_page.text)
            play_url = _fetch_play_url(*ids) if ids else None
        except requests.RequestException as exc:
            # One unreachable song must not abort the rest of the chart.
            print('skipped {} {} - {}: {}'.format(rank, singer, song, exc))
            continue

        if play_url is None:
            print('skipped {} {} - {}: no play data'.format(rank, singer, song))
            continue

        print(rank, singer, song, duration, play_url)
        writer.writerow([rank, singer, song, duration, play_url])
if __name__ == '__main__':
    # Chart pages 1 through 5 of the rank/home listing.
    urls = [
        'https://www.kugou.com/yy/rank/home/{}-6666.html?from=rank'.format(i)
        for i in range(1, 6)
    ]
    # The context manager closes the file even if a page fails part-way;
    # newline='' lets the csv module control line endings itself.
    with open('song.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['rank', 'singer', 'song', 'time', 'play_url'])
        for url in urls:
            get_info(url, writer)