登录 | 注册 | 写文章

抓取网站音频

抓取网站音频

要求：抓取喜马拉雅的音频并保存到本地目录 D:\temp_ximalaya_audio，按节目和章节分类存储。

特殊说明：分析思路参考 https://www.jianshu.com/p/fc2e83c6583c

__author__ ='tony'

import asyncio
import json
import os
import random
import shutil
import time
import urllib.request

import aiohttp
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Root directory that receives every downloaded audio file.
filePath = "D:\\temp_ximalaya_audio"

# Module-level placeholder. get_url() assigns a local variable of the same
# name (it declares no `global`), so this value itself is never updated.
channelFilePath = ""

# Initialise the output directory: whatever currently occupies the path is
# removed so that every run starts from an empty directory.
if os.path.isdir(filePath):
    shutil.rmtree(filePath)  # recursively delete the existing directory tree
elif os.path.isfile(filePath):
    os.remove(filePath)  # a plain file is occupying the path; delete it
os.makedirs(filePath)  # (re)create the empty root directory

# mongodb

#clients = pymongo.MongoClient('localhost')

#db = clients["XiMaLaYa"]

#col1 = db["album2"]

#col2 = db["detaile2"]

# Pool of browser User-Agent strings; the request header dicts pick one entry
# at random. Several entries appear twice, which is kept as-is because it
# affects how often each string is chosen.
UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Request headers used for the album listing pages and the per-track JSON
# endpoint. The User-Agent is drawn from UA_LIST once, when this module is
# loaded, and is then reused for every request made with this dict.
headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST),
}

# Request headers used for album pages. Same fields as headers1 plus a fixed
# Referer; the User-Agent is an independent random draw from UA_LIST made
# once, when this module is loaded.
headers2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.ximalaya.com/dq/all/2',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST),
}

# Characters Windows does not allow inside a single path component.
_INVALID_DIR_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def _safe_dir_name(name):
    """Return *name* made usable as a single directory name on Windows."""
    cleaned = name.translate(_INVALID_DIR_CHARS)
    # Windows also rejects components that end in a dot or a space.
    cleaned = cleaned.strip().rstrip(". ")
    return cleaned or "_"


def get_url():
    """Crawl the album listing pages and download every album found on them.

    For each URL in ``start_urls`` the page is fetched with ``headers1`` and
    parsed with BeautifulSoup. Every element with class ``albumfaceOutter``
    yields the album link (``item.a['href']``), title (``item.img['alt']``)
    and cover URL (``item.img['src']``). A directory named after the title is
    created under ``filePath`` (replacing any existing entry of that name)
    and ``another()`` is called to download the album into it.

    Returns:
        None. Progress and debug output is written to stdout.
    """
    # To crawl every listing page instead of only the first one, use:
    # start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    start_urls = ["http://www.ximalaya.com/dq/all/1/"]
    print(start_urls)

    for start_url in start_urls:
        print(start_url)

        print("===============begin html=============")
        html = requests.get(start_url, headers=headers1).text
        print("html = {}".format(html))
        print("===============end html=============")

        print("===============begin soup=============")
        soup = BeautifulSoup(html, 'lxml')
        print(soup)
        print("===============end soup=============")

        for item in soup.find_all(class_="albumfaceOutter"):
            print("================begin item================")
            print(item)
            print("================end item=========================")

            print("================begin content================")
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src'],
            }
            print(content)
            print("================end content=========================")

            print('写入一个频道' + content['href'])

            subchannel = content['href']
            print("============begin subchannel===================")
            print(subchannel)
            subchannelArr = subchannel.split("/")
            print(subchannelArr)

            # The album title becomes the directory name. It is sanitised
            # first because a raw title may contain characters that are not
            # legal in a Windows path, which would make os.makedirs() fail.
            channelFilePath = _safe_dir_name(content['title'])
            print(channelFilePath)
            channelFilePath = os.path.join(filePath, channelFilePath)
            print(channelFilePath)

            # Start each album from an empty directory.
            if os.path.isdir(channelFilePath):
                shutil.rmtree(channelFilePath)  # recursively delete the directory tree
            elif os.path.isfile(channelFilePath):
                os.remove(channelFilePath)  # delete the file occupying the path
            os.makedirs(channelFilePath)  # create the album directory
            print("============end subchannel===================")

            print(content)
            another(channelFilePath, content['href'])

        # Pause between listing pages to avoid hammering the site.
        time.sleep(1)

def another(channelFilePath, url):
    """Download all pages of one album into *channelFilePath*.

    The album page at *url* is fetched with ``headers2`` and inspected for a
    pagination bar (``div.pagingBar_wrapper``). The ``data-page`` attribute of
    its second-to-last link is read as the page count.

    Args:
        channelFilePath: Directory that receives the album's audio files.
        url: URL of the album's first page.

    Returns:
        None. Each page is handed to ``get_m4a()``.
    """
    print("=======================begin another html=======================")
    html = requests.get(url, headers=headers2).text
    print(html)
    print("=======================end another html=======================")

    print("=======================begin another ifanother=======================")
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    print(ifanother)
    print("=======================end another ifanother=======================")

    if not ifanother:
        # No pagination bar: the album fits on the single page at `url`.
        get_m4a(channelFilePath, url)
        return

    num = ifanother[0]
    print('本频道资源存在' + num + '个页面')

    # Visit pages 1..num inclusive. The previous range(1, num) stopped one
    # page short and then re-fetched the base URL.
    # NOTE(review): assumes data-page holds the last page number and that
    # `?page=1` serves the same content as the base URL - confirm against
    # the live site.
    for n in range(1, int(num) + 1):
        print('开始解析{}个中的第{}个页面'.format(num, n))
        url2 = url + '?page={}'.format(n)
        print(url)
        print(url2)
        get_m4a(channelFilePath, url2)

# Characters Windows does not allow inside a file name.
_INVALID_FILE_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def _safe_file_name(name):
    """Return *name* made usable as a file-name stem on Windows."""
    cleaned = name.translate(_INVALID_FILE_CHARS).strip()
    return cleaned or "_"


def get_m4a(channelFilePath, url):
    """Download every track listed on one album page.

    The page at *url* is fetched with ``headers2``; the comma-separated
    ``sound_ids`` attribute of ``div.personal_body`` gives the track ids. For
    each id the track's JSON description is fetched from
    ``http://www.ximalaya.com/tracks/<id>.json`` with ``headers1``, the audio
    at its ``play_path`` is downloaded, and the bytes are written to
    ``<channelFilePath>/<title>.<extension>``.

    Args:
        channelFilePath: Existing directory that receives the audio files.
        url: URL of one page of an album.

    Returns:
        None. Pages without ``sound_ids`` and tracks without ``play_path``
        are skipped with a message instead of raising.
    """
    time.sleep(1)  # throttle: one album page per second at most
    page_html = requests.get(url, headers=headers2).text

    print("==============begin get_m4a====================")
    sound_id_attrs = etree.HTML(page_html).xpath('//div[@class="personal_body"]/@sound_ids')
    if not sound_id_attrs:
        # Previously an IndexError; a page without a track list is skipped.
        print("no sound_ids found on " + url)
        print("==============end get_m4a====================")
        return
    # Drop empty ids so an empty attribute does not yield a bogus request.
    numlist = [sound_id for sound_id in sound_id_attrs[0].split(',') if sound_id]
    print(numlist)
    print("==============end get_m4a====================")

    for track_id in numlist:
        print("==============begin get_m4a murl====================")
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(track_id)
        print(murl)
        print(channelFilePath)
        print("==============begin get_m4a murl====================")

        print("==============begin get_m4a html====================")
        track_json = requests.get(murl, headers=headers1).text
        print(track_json)
        print("==============end get_m4a html====================")

        print("==============begin get_m4a dic====================")
        dic = json.loads(track_json)
        print(dic)
        print("==============end get_m4a dic====================")

        print("==============begin get_m4a getdata====================")
        audio_url = dic.get("play_path")
        print(audio_url)
        if not audio_url:
            # Without a download URL there is nothing to fetch; skip this
            # track rather than aborting the whole crawl.
            print("skip track without play_path: " + murl)
            continue
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(audio_url) as response:
            audio_data = response.read()
        print("==============end get_m4a getdata====================")

        print("==============begin get_m4a savedata====================")
        # The extension is whatever follows the last dot of the audio URL.
        postfix = audio_url.split(".")[-1]
        # The title is sanitised because it may contain characters that are
        # not legal in a Windows file name.
        audio_name = _safe_file_name(dic['title']) + "." + postfix
        audio_path = os.path.join(channelFilePath, audio_name)
        # `with` guarantees the file is closed even if the write fails.
        with open(audio_path, "wb") as audio_file:
            audio_file.write(audio_data)
        print("==============end get_m4a savedata====================")

        print("下载文件", audio_path, "成功,另存路径:" + audio_path)
        print("==============end get_m4a dic====================")

if __name__ == '__main__':
    # Run the crawl only when this file is executed as a script.
    print("begin")
    get_url()
    print("end")

©著作权归作者所有,转载或内容合作请联系作者

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 216,258评论 6赞 498
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 92,335评论 3赞 392
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 162,225评论 0赞 353
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 58,126评论 1赞 292
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 67,140评论 6赞 388
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 51,098评论 1赞 295
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 40,018评论 3赞 417
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 38,857评论 0赞 273
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 45,298评论 1赞 310
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 37,518评论 2赞 332
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 39,678评论 1赞 348
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 35,400评论 5赞 343
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,993评论 3赞 325
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 31,638评论 0赞 22
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,801评论 1赞 268
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 47,661评论 2赞 368
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 44,558评论 2赞 352

推荐阅读更多精彩内容

为什么钱那么重要
曾经，我对钱不屑一顾，觉得那是世界上最污秽的东西。有句俗话“有钱不是万能的，但没钱是万万不能的，我极其同意这句话...
爱吃麻辣烫的奥特曼阅读 1,361评论 0赞 1
2017-12-23
✔关于通信行业和中年危机，答《三联生活周刊》采访 -- 邓志强 ✔http://mp.weixin.qq.com/...
大雨不愁阅读 313评论 0赞 0
每日一问
第四天，你为什么离开上一家公司？工作俩年多，得不到领导的认可。在旅游旺季，每个部门都辛苦加班的时候，把...
流年七里香农庄阅读 118评论 0赞 0
聚干货：创业公司如何正确有效的开会？
（原图来源：scottberkun.com，标点符译）创业团队开会应该是这样的：所有人一起讨论、解决那些最核心的...
聚创阅读 625评论 0赞 2

1赞2赞

赞赏

手机看全文