要求:抓取喜马拉雅音频存储到本地 D:\temp_ximalaya_audio,并按节目和章节分类存储。
特殊说明:分析思路参考 https://www.jianshu.com/p/fc2e83c6583c
__author__ ='tony'
import asyncio
import json
import os
import random
import shutil
import time
import urllib.parse
import urllib.request

import aiohttp
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree
# Root directory that receives every downloaded audio file.
filePath = "D:\\temp_ximalaya_audio"
# Module-level placeholder; get_url() assigns its own local of the same name.
channelFilePath = ""

# Initialise the download root: remove whatever currently occupies the path
# (a directory tree or a plain file) and recreate it empty.
if os.path.isdir(filePath):
    shutil.rmtree(filePath)  # recursively delete the existing directory tree
elif os.path.isfile(filePath):
    os.remove(filePath)  # a plain file is in the way; delete it
os.makedirs(filePath)  # create the (now guaranteed absent) directory
# mongodb
#clients = pymongo.MongoClient('localhost')
#db = clients["XiMaLaYa"]
#col1 = db["album2"]
#col2 = db["detaile2"]
# Pool of browser User-Agent strings; one is picked with random.choice() when
# the request header dicts are built.  Several entries appear more than once,
# so the draw is not uniform over the distinct strings.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
def _build_headers(referer=None):
    """Assemble one browser-like request header dict.

    A User-Agent is drawn at random from UA_LIST on every call.  The
    ``Referer`` entry is inserted (between ``Proxy-Connection`` and
    ``Upgrade-Insecure-Requests``) only when *referer* is given.
    """
    fields = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Proxy-Connection': 'keep-alive',
    }
    if referer is not None:
        fields['Referer'] = referer
    fields['Upgrade-Insecure-Requests'] = '1'
    fields['User-Agent'] = random.choice(UA_LIST)
    return fields


# headers1: sent with the listing-page request and the per-track JSON request.
headers1 = _build_headers()
# headers2: same fields plus a Referer; sent with album-page requests.
headers2 = _build_headers('http://www.ximalaya.com/dq/all/2')
def _safe_dir_name(title):
    """Turn an album title into a name that is safe as a single directory.

    Characters that Windows forbids in a path component are replaced by
    "_", and surrounding whitespace plus trailing dots are dropped.  Titles
    that end up empty (including "." and "..") become "_", so the result can
    never point at the download root itself or at its parent.
    """
    forbidden = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    cleaned = str(title).translate(forbidden).strip().rstrip(". ")
    return cleaned or "_"


def get_url(start_urls=None):
    """Crawl album listing pages and download every album found on them.

    For each listing page, every element with class ``albumfaceOutter`` is
    read for its link, title and cover URL.  A fresh sub-directory of
    ``filePath`` named after the (sanitised) title is created and the album
    link is handed to ``another`` for downloading.

    Args:
        start_urls: Optional iterable of listing-page URLs.  When omitted,
            only ``http://www.ximalaya.com/dq/all/1/`` is crawled, which is
            what the function did before the parameter existed.
    """
    if start_urls is None:
        start_urls = ["http://www.ximalaya.com/dq/all/1/"]
    start_urls = list(start_urls)
    print(start_urls)
    for start_url in start_urls:
        print(start_url)
        print("===============begin html=============")
        html = requests.get(start_url, headers=headers1).text
        print("html = {}".format(html))
        print("===============end html=============")
        print("===============begin soup=============")
        soup = BeautifulSoup(html, 'lxml')
        print(soup)
        print("===============end soup=============")
        for item in soup.find_all(class_="albumfaceOutter"):
            print("================begin item================")
            print(item)
            print("================end item=========================")
            print("================begin content================")
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src'],
            }
            print(content)
            print("================end content=========================")
            print('写入一个频道' + content['href'])
            subchannel = content['href']
            print("============begin subchannel===================")
            print(subchannel)
            subchannelArr = subchannel.split("/")
            print(subchannelArr)
            channelFilePath = content['title']
            print(channelFilePath)
            # The title comes from the remote page and the resulting path is
            # deleted recursively below, so it must be sanitised first: a raw
            # "" or ".." would resolve to the download root or its parent.
            channelFilePath = os.path.join(
                filePath, _safe_dir_name(channelFilePath))
            print(channelFilePath)
            if os.path.isdir(channelFilePath):
                shutil.rmtree(channelFilePath)  # recursively delete the tree
            elif os.path.isfile(channelFilePath):
                os.remove(channelFilePath)  # delete the file
            os.makedirs(channelFilePath)  # create the directory
            print("============end subchannel===================")
            print(content)
            another(channelFilePath, content['href'])
        # Pause between listing pages to keep the request rate low.
        time.sleep(1)
def another(channelFilePath, url):
    """Download all tracks of one album, following its pagination.

    The album page at *url* is fetched and its pager
    (``div.pagingBar_wrapper``) is inspected.  When a pager is present,
    every page from 1 up to and including the reported page count is passed
    to ``get_m4a``; otherwise *url* itself is passed once.

    Args:
        channelFilePath: Directory that receives the album's audio files.
        url: URL of the album's first page.
    """
    print("=======================begin another html=======================")
    html = requests.get(url, headers=headers2).text
    print(html)
    print("=======================end another html=======================")
    print("=======================begin another ifanother=======================")
    # NOTE(review): assumes the second-to-last pager link carries the number
    # of the last page (the final link being "next") -- confirm against the
    # live markup.
    ifanother = etree.HTML(html).xpath(
        '//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    print(ifanother)
    print("=======================end another ifanother=======================")
    if not ifanother:
        # No pager: the album fits on a single page.
        get_m4a(channelFilePath, url)
        return
    num = ifanother[0]
    print('本频道资源存在' + num + '个页面')
    # Upper bound is inclusive so the last page is not skipped; "?page=1"
    # already covers the first page, so *url* is not fetched a second time.
    for n in range(1, int(num) + 1):
        print('开始解析{}个中的第{}个页面'.format(num, n))
        url2 = url + '?page={}'.format(n)
        print(url)
        print(url2)
        get_m4a(channelFilePath, url2)
def _safe_file_name(title):
    """Turn a track title into a name that is safe as a single file name.

    Characters that Windows forbids in a path component are replaced by
    "_", and surrounding whitespace plus trailing dots are dropped.  An
    empty result becomes "_".
    """
    forbidden = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    cleaned = str(title).translate(forbidden).strip().rstrip(". ")
    return cleaned or "_"


def _download_track(channelFilePath, track_id):
    """Fetch one track's JSON metadata and save its audio into the directory."""
    print("==============begin get_m4a murl====================")
    murl = 'http://www.ximalaya.com/tracks/{}.json'.format(track_id)
    print(murl)
    print(channelFilePath)
    print("==============begin get_m4a murl====================")
    print("==============begin get_m4a html====================")
    html = requests.get(murl, headers=headers1).text
    print(html)
    print("==============end get_m4a html====================")
    print("==============begin get_m4a dic====================")
    dic = json.loads(html)
    print(dic)
    print("==============end get_m4a dic====================")
    print("==============begin get_m4a getdata====================")
    audio_url = dic.get("play_path")
    print(audio_url)
    if not audio_url:
        # Without a playable URL there is nothing to download; skip this
        # track instead of aborting the whole crawl.
        print("跳过曲目 {}:接口未返回 play_path".format(track_id))
        return
    with urllib.request.urlopen(audio_url) as response:
        audio_data = response.read()
    print("==============end get_m4a getdata====================")
    print("==============begin get_m4a savedata====================")
    # Take the extension from the URL *path* so that a query string or
    # fragment cannot leak into the file name.
    extension = os.path.splitext(urllib.parse.urlsplit(audio_url).path)[1]
    target = os.path.join(
        channelFilePath, _safe_file_name(dic['title']) + extension)
    with open(target, "wb") as audio_file:
        audio_file.write(audio_data)
    print("==============end get_m4a savedata====================")
    print("下载文件", target, "成功,另存路径:" + target)
    print("==============end get_m4a dic====================")


def get_m4a(channelFilePath, url):
    """Download every track listed on one album page.

    The page at *url* is fetched and the comma-separated ``sound_ids``
    attribute of ``div.personal_body`` is read.  For each id the track's
    JSON document is requested and the audio behind its ``play_path`` is
    written to *channelFilePath*, named after the track title.

    Args:
        channelFilePath: Existing directory that receives the audio files.
        url: URL of one (possibly paginated) album page.
    """
    time.sleep(1)  # throttle: one album page per second at most
    html = requests.get(url, headers=headers2).text
    print("==============begin get_m4a====================")
    matches = etree.HTML(html).xpath(
        '//div[@class="personal_body"]/@sound_ids')
    # A page without the attribute (or with an empty one) simply lists no
    # tracks; indexing [0] unconditionally used to raise IndexError here.
    numlist = []
    if matches:
        numlist = [tid.strip() for tid in matches[0].split(',') if tid.strip()]
    print(numlist)
    print("==============end get_m4a====================")
    for track_id in numlist:
        _download_track(channelFilePath, track_id)
if __name__ == '__main__':
    # Script entry point: run one crawl, bracketed by progress markers.
    print("begin")
    get_url()
    print("end")