# image.png
#coding=utf-8
import re,urllib2,os,urllib,requests
def getHtmlCode(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: URL passed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``response.read()`` yields for the opened URL.

    Raises:
        Any error raised by ``urllib2.urlopen`` or ``read()`` is propagated
        to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the underlying connection, even if read() fails.
        response.close()
def getEntityId(htmlString):
    """Return every quoted value that directly precedes a ``goods_id`` key.

    The page text is scanned for ``"<value>","goods_id`` and each
    ``<value>`` is collected in order of appearance.

    Args:
        htmlString: Page source to scan.

    Returns:
        A list of the captured values; empty when nothing matches.
    """
    # [^"]+ keeps the capture inside a single quoted value. A lazy ".+?"
    # would start at the first quote on the line and drag in unrelated
    # text that precedes the value.
    regEntityId = re.compile(r'"([^"]+)","goods_id')
    return regEntityId.findall(htmlString)
def getMp3Url(htmlString):
    """Return the mp3 URLs found in ``htmlString``, without the extension.

    Matches text of the form ``mp3","url":"<prefix>.mp3`` and captures
    ``<prefix>``; callers append ``.mp3`` themselves.

    Args:
        htmlString: Page source to scan.

    Returns:
        A list of URL prefixes (everything before ``.mp3``); empty when
        nothing matches.
    """
    # The dot is escaped so only a literal ".mp3" ends the capture; an
    # unescaped "." would also stop at e.g. "xmp3" in the middle of a URL.
    regMp3Url = re.compile(r'mp3","url":"(.+?)\.mp3')
    return regMp3Url.findall(htmlString)
def getTitle(htmlString):
    """Return the value of every ``"title":"..."`` pair in ``htmlString``.

    Args:
        htmlString: Page source to scan.

    Returns:
        A list of title strings in order of appearance; empty when nothing
        matches. Backslash escape sequences inside a title are returned
        as-is (they are not decoded).
    """
    # Consume either an ordinary character or a backslash escape, so a
    # title containing an escaped double quote is not cut short at that
    # quote. Newlines are excluded to keep a match on a single line.
    regTitle = re.compile(r'"title":"((?:[^"\\\n]|\\.)+)"')
    return regTitle.findall(htmlString)
def getPublishTime(htmlString):
    """Return the leading token of every ``published_at`` value.

    For a value such as ``2017-06-01 08:30:00`` only the part before the
    first whitespace (``2017-06-01``) is captured.

    Args:
        htmlString: Page source to scan.

    Returns:
        A list of captured tokens in order of appearance; empty when
        nothing matches.
    """
    # Stop at whitespace OR the closing quote: if a timestamp contains no
    # space, the capture must not run on into whatever follows the value.
    regUpdateTime = re.compile(r'published_at":"([^"\s]+)')
    return regUpdateTime.findall(htmlString)
if __name__ == '__main__':
    # Listing page that is scanned for entity ids.
    url = 'http://36kr.com/user/947181171'

    # Downloads are written below this directory (relative to the current
    # working directory). The "already downloaded" check uses the very same
    # path, so the check and the write can never disagree.
    audioDir = 'audios'
    if not os.path.isdir(audioDir):
        os.makedirs(audioDir)

    htmlCode = getHtmlCode(url)
    entityIds = getEntityId(htmlCode)
    for content in entityIds:
        # Keep only the text after the last double quote, in case the match
        # carries leading text in front of the id.
        entityId = content.split("\"")[-1]
        contentUrl = 'http://36kr.com/p/' + entityId + '.html'
        contentHtmlCode = getHtmlCode(contentUrl)

        mp3Urls = getMp3Url(contentHtmlCode)
        titles = getTitle(contentHtmlCode)
        publishTimes = getPublishTime(contentHtmlCode)
        if not (mp3Urls and titles and publishTimes):
            # Page has no audio (or is missing metadata): nothing to save,
            # and indexing [0] below would raise IndexError.
            print(contentUrl + '------skipped')
            continue

        # A "/" in the title would be read as a directory separator.
        fileName = (publishTimes[0] + ' ' + titles[0]).replace('/', '_')
        print(fileName)
        mp3Url = mp3Urls[0] + '.mp3'
        print(mp3Url)

        targetPath = os.path.join(audioDir, fileName + '.mp3')
        if not os.path.isfile(targetPath):
            urllib.urlretrieve(mp3Url, targetPath)
            print(fileName + '------done')