python3 https://www.python.org
从官网下载安装或者用brew
$ brew install python3
$ brew linkapps python3
pipenv https://github.com/pypa/pipenv
$ pip install pipenv
.zshrc
eval "$(pipenv --completion)"
requests-html http://html.python-requests.org/en/latest/
$ git clone https://github.com/iOSDevLog/Machine-Learning-Crash-Course
$ cd Machine-Learning-Crash-Course
$ pipenv --python 3.6
$ pipenv install requests-html
$ pipenv shell # To activate this project's virtualenv
fetch_course.py
#!/usr/bin/env python
import os
import ssl
import time
from requests_html import HTMLSession
# Replace the factory that stdlib HTTPS clients (e.g. urllib.request.urlopen)
# use for their default SSL context with one that skips certificate checks.
# NOTE(review): this relies on private ssl names and turns off TLS certificate
# verification for the whole process - confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
# Root URL of the crash-course pages.
base_url = 'https://developers.google.com/machine-learning/crash-course/'
def course_info(course_url):
    """Load a course page and return its media URLs and the next page's URL.

    Args:
        course_url: URL of the course page to load.

    Returns:
        A tuple ``(data_video_url, data_captions_url, next_url)``. The first
        two items are currently always empty strings because the video lookup
        is disabled. ``next_url`` is the ``href`` of the element matched by
        ``div.devsite-steps-next > a.devsite-steps-link``, or ``None`` when
        the page has no such link.
    """
    data_video_url = ''
    data_captions_url = ''
    # Close the session once the page has been queried so its pooled
    # connections are not leaked on every call.
    with HTMLSession() as session:
        response = session.get(course_url)
        # Video/caption lookup is disabled for now. When re-enabled, read the
        # 'data-video-url' and 'data-captions-url' attributes of the first
        # '.devsite-vplus' element into the two variables above.
        next_link = response.html.find(
            'div.devsite-steps-next > a.devsite-steps-link', first=True)
    # find(..., first=True) yields None when nothing matches; report "no next
    # page" instead of failing on the attribute access.
    next_url = next_link.attrs.get('href') if next_link is not None else None
    return (data_video_url, data_captions_url, next_url)
import urllib.request
def getHtml(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read()
def saveHtml(file_name, file_content):
    """Write *file_content* to ``course_html/<file_name>.html``.

    Args:
        file_name: Base name for the output file. Every '/' is replaced with
            '_' so the name stays a single path component.
        file_content: Bytes to write; the file is opened in binary mode.
    """
    out_dir = 'course_html/'
    # Create the output directory on first use so the write does not fail
    # with FileNotFoundError.
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, file_name.replace('/', '_') + '.html')
    with open(target, 'wb') as f:
        f.write(file_content)
if __name__ == '__main__':
    # Crawl the course by following each page's "next" link and saving every
    # page that is reached. The starting page itself is not saved.
    max_failures = 5  # consecutive failures tolerated before giving up
    failures = 0
    next_url = 'https://developers.google.com/machine-learning/crash-course/framing/check-your-understanding'
    while next_url:
        try:
            (_, _, following_url) = course_info(next_url)
            if following_url:
                # Name the file after the path below base_url so that pages
                # with the same last segment in different sections do not
                # overwrite each other; fall back to the last segment.
                if following_url.startswith(base_url):
                    filename = following_url[len(base_url):].strip('/')
                else:
                    filename = ''
                filename = filename or os.path.basename(following_url)
                html = getHtml(following_url)
                saveHtml(filename, html)
                print(following_url)
        except Exception as error:
            # Catch Exception rather than everything so Ctrl-C still stops
            # the crawl, and bound the retries so a page that always fails
            # cannot loop forever.
            failures += 1
            if failures >= max_failures:
                print(f"Giving up on {next_url} after {failures} failed attempts: {error!r}")
                break
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue
        # Advance only after the page was fetched and saved, so a failed
        # download is retried instead of skipped.
        failures = 0
        next_url = following_url
data_video_url
为mp4视频相对地址
data_captions_url
为字幕相对地址
通过 base_url
可得到绝对地址,后面再写吧。