分析函数:使用 XPath 匹配歌曲的"歌手 - 歌曲名"、歌曲链接、歌曲时长等信息。
import requests
from lxml import etree
import json
import pymysql
# Request headers passed to requests.get() in get_info; the User-Agent
# value is a desktop Chrome 74 browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    ),
}
def get_info(url):
    """Download a page and extract its song entries with XPath.

    Args:
        url: Address of the page to download.

    Returns:
        A list of dicts, one per song, with the keys ``song_name``,
        ``songhref`` and ``songtime``.  Each value is a one-element tuple
        (the writer functions in this file read it with ``[0]``);
        ``songtime`` has surrounding whitespace stripped.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    res = requests.get(url, headers=headers, timeout=10)
    page = etree.HTML(res.text)
    singers_songs = page.xpath('.//a[@class="pc_temp_songname"]/text()')
    songhrefs = page.xpath('.//a[@class="pc_temp_songname"]/@href')
    songtimes = page.xpath('.//span[@class="pc_temp_time"]/text()')
    # zip() stops at the shortest of the three lists.
    return [
        {
            'song_name': (song_name,),
            'songhref': (songhref,),
            'songtime': (songtime.strip(),),
        }
        for song_name, songhref, songtime
        in zip(singers_songs, songhrefs, songtimes)
    ]
匹配到数据后写入到html文件
def w_html(data):
    """Append one HTML link per song to ``kougou红歌榜.html``.

    Each link is also printed to stdout.

    Args:
        data: Iterable of dicts whose ``songhref`` and ``song_name`` values
            are indexable; element ``[0]`` of each is used as the link
            target and the link text respectively.
    """
    # Local import: the values are scraped text, so they are escaped before
    # being embedded in markup.
    import html

    # Explicit UTF-8 so writing does not depend on the platform's default
    # encoding.
    with open('kougou红歌榜.html', 'a', encoding='utf-8') as f:
        for d in data:
            href = html.escape(str(d['songhref'][0]))
            title = html.escape(str(d['song_name'][0]))
            link = '<a href="{}" target="_blank">{}</a>'.format(href, title)
            print(link)
            f.write(link + '<br>')
也可以写入数据库,此处使用 MySQL 测试。
先创建表:
mysql> create table kugou_songs(
-> id int auto_increment primary key,
-> songname varchar(30) not null,
-> songhref varchar(500) ,
-> times varchar(20));
Query OK, 0 rows affected (0.02 sec)
连接数据库并写入数据
def w_mysql(data):
    """Insert the scraped songs into the ``kugou_songs`` MySQL table.

    Connects to the ``jt`` database on 127.0.0.1:3306 and inserts one row
    per entry.  Errors are printed rather than raised; rows inserted before
    an error are still committed.

    Args:
        data: Iterable of dicts whose ``song_name``, ``songhref`` and
            ``songtime`` values are indexable; element ``[0]`` of each is
            stored.
    """
    # Placeholders let the driver quote the values, so titles containing
    # quotes cannot break or alter the statement.
    insert_sql = 'insert into kugou_songs values(0, %s, %s, %s)'
    conn = None
    cur = None
    try:
        # 1. Open the database connection.
        conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            db='jt',
            user='root',
            passwd='123456',
        )
        # 2. Create a cursor.
        cur = conn.cursor()
        for d in data:
            row = (
                str(d["song_name"][0]),
                str(d["songhref"][0]),
                str(d["songtime"][0]),
            )
            # Show the statement as it will be sent to the server.
            print(cur.mogrify(insert_sql, row))
            cur.execute(insert_sql, row)
    except Exception as e:
        print(e)
    finally:
        # conn/cur stay None when connecting failed; skipping the cleanup
        # then avoids a NameError hiding the real error.
        if conn is not None:
            try:
                # Commit the transaction.
                conn.commit()
            finally:
                if cur is not None:
                    cur.close()
                conn.close()
构造url,测试了前5页
if __name__ == '__main__':
    # Walk ranking pages 1 through 5 and append each page's songs to the
    # HTML file.  w_mysql(songs) may be called instead to store them in MySQL.
    url_template = 'https://www.kugou.com/yy/rank/home/{}-23784.html?from=rank'
    for page_no in range(1, 6):
        songs = get_info(url_template.format(page_no))
        w_html(songs)
效果截图
写入的html文件,点击即可跳转到相应歌曲页面
写入的数据库文件
思考:再复杂一点的做法是先获取热门榜单所有栏目的链接,然后根据这些链接分别抓取各栏目的歌曲排行。此处没有再继续测试。
完整代码:
import requests
from lxml import etree
import json
import pymysql
# Request headers passed to requests.get() in get_info; the User-Agent
# value is a desktop Chrome 74 browser string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    ),
}
def get_info(url):
    """Download a page and extract its song entries with XPath.

    Args:
        url: Address of the page to download.

    Returns:
        A list of dicts, one per song, with the keys ``song_name``,
        ``songhref`` and ``songtime``.  Each value is a one-element tuple
        (the writer functions in this file read it with ``[0]``);
        ``songtime`` has surrounding whitespace stripped.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    res = requests.get(url, headers=headers, timeout=10)
    page = etree.HTML(res.text)
    singers_songs = page.xpath('.//a[@class="pc_temp_songname"]/text()')
    songhrefs = page.xpath('.//a[@class="pc_temp_songname"]/@href')
    songtimes = page.xpath('.//span[@class="pc_temp_time"]/text()')
    # zip() stops at the shortest of the three lists.
    return [
        {
            'song_name': (song_name,),
            'songhref': (songhref,),
            'songtime': (songtime.strip(),),
        }
        for song_name, songhref, songtime
        in zip(singers_songs, songhrefs, songtimes)
    ]
def w_html(data):
    """Append one HTML link per song to ``kougou红歌榜.html``.

    Each link is also printed to stdout.

    Args:
        data: Iterable of dicts whose ``songhref`` and ``song_name`` values
            are indexable; element ``[0]`` of each is used as the link
            target and the link text respectively.
    """
    # Local import: the values are scraped text, so they are escaped before
    # being embedded in markup.
    import html

    # Explicit UTF-8 so writing does not depend on the platform's default
    # encoding.
    with open('kougou红歌榜.html', 'a', encoding='utf-8') as f:
        for d in data:
            href = html.escape(str(d['songhref'][0]))
            title = html.escape(str(d['song_name'][0]))
            link = '<a href="{}" target="_blank">{}</a>'.format(href, title)
            print(link)
            f.write(link + '<br>')
def w_mysql(data):
    """Insert the scraped songs into the ``kugou_songs`` MySQL table.

    Connects to the ``jt`` database on 127.0.0.1:3306 and inserts one row
    per entry.  Errors are printed rather than raised; rows inserted before
    an error are still committed.

    Args:
        data: Iterable of dicts whose ``song_name``, ``songhref`` and
            ``songtime`` values are indexable; element ``[0]`` of each is
            stored.
    """
    # Placeholders let the driver quote the values, so titles containing
    # quotes cannot break or alter the statement.
    insert_sql = 'insert into kugou_songs values(0, %s, %s, %s)'
    conn = None
    cur = None
    try:
        # 1. Open the database connection.
        conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            db='jt',
            user='root',
            passwd='123456',
        )
        # 2. Create a cursor.
        cur = conn.cursor()
        # Insert one row per song.
        for d in data:
            row = (
                str(d["song_name"][0]),
                str(d["songhref"][0]),
                str(d["songtime"][0]),
            )
            # Show the statement as it will be sent to the server.
            print(cur.mogrify(insert_sql, row))
            cur.execute(insert_sql, row)
    except Exception as e:
        print(e)
    finally:
        # conn/cur stay None when connecting failed; skipping the cleanup
        # then avoids a NameError hiding the real error.
        if conn is not None:
            try:
                # Commit the transaction.
                conn.commit()
            finally:
                if cur is not None:
                    cur.close()
                conn.close()
if __name__ == '__main__':
    # Walk ranking pages 1 through 5 and append each page's songs to the
    # HTML file.  w_mysql(songs) may be called instead to store them in MySQL.
    url_template = 'https://www.kugou.com/yy/rank/home/{}-23784.html?from=rank'
    for page_no in range(1, 6):
        songs = get_info(url_template.format(page_no))
        w_html(songs)