原理:先通过 Selenium 打开栏目页面并滚动加载,把页面源码保存到 yscj.txt,再从中提取视频地址,最后通过 you-get 批量下载。
from selenium import webdriver
#from msedge.selenium_tools import EdgeOptions
#from msedge.selenium_tools import Edge
from selenium.webdriver.common.by import By
import time
# Step 1: open the CCTV column page in Edge, scroll through it, and save the
# resulting page source to yscj.txt for the extraction steps that follow.

edge_options = webdriver.EdgeOptions()
# NOTE(review): `use_chromium` looks like a flag from the legacy
# msedge-selenium-tools package (see the commented-out imports); confirm it is
# still needed with the installed Selenium version.
edge_options.use_chromium = True
# Extra browser switches; more can be added here.
edge_options.add_argument('--ignore-certificate-errors')
edge_options.add_argument('--ignore-ssl-errors')
edge_options.add_argument('--enable-chrome-browser-cloud-management')
# Headless mode is available but intentionally disabled: running with a
# visible window was found to work better.
#edge_options.add_argument('--headless')

driver = webdriver.Edge(options=edge_options)
#driver = Edge(options=edge_options)  # legacy msedge-selenium-tools variant
try:
    driver.get('https://tv.cctv.com/lm/gzsbqlx/index.shtml')
    # Scroll down in 100 steps of 1000px, pausing 2s after each step
    # (presumably so that further list entries have time to load).
    for i in range(100):
        driver.execute_script(f'document.documentElement.scrollTop={(i+1)*1000}')
        time.sleep(2)
    # Grab the source before touching the output file so that a failure here
    # does not leave a truncated yscj.txt behind.
    page_source = driver.page_source
finally:
    # Always shut the browser down, even if navigation or scrolling failed.
    driver.quit()

with open("yscj.txt", 'w', encoding='utf-8') as f:
    f.write(page_source)
再通过正则表达式从 yscj.txt 中提取视频地址,生成批量下载命令文件 yscj3.txt(每行一条 you-get 命令,可作为 bat 脚本使用)
import re
# Step 2 (regex variant): pull the video page URLs out of the saved page
# source and write one "you-get <url>" command per line to yscj3.txt.

with open('yscj.txt', encoding='utf-8') as f:
    read_data = f.read()

# URL shape: https://tv.cctv.com/YYYY/MM/DD/<24 alphanumerics><YYMMDD>.shtml
# The back-references require the date suffix of the file name to repeat the
# two-digit year, month and day from the path. Both halves are raw strings and
# the literal dots are escaped so that "." cannot match arbitrary characters.
gzsb = (
    r"(https://tv\.cctv\.com/\d{2}(?P<year>\d{2})/(?P<month>\d{2})/"
    r"(?P<day>\d{2})/[A-Za-z0-9]{24}(?P=year)(?P=month)(?P=day)\.shtml)"
)

# A set removes duplicate links in one pass; the result is sorted in
# descending string order, as before.
gzsblist = sorted(
    {match.group(0) for match in re.finditer(gzsb, read_data)},
    reverse=True,
)

with open("yscj3.txt", 'w', encoding='utf-8') as f:
    f.writelines("you-get " + url + "\n" for url in gzsblist)
或者使用 BeautifulSoup(bs4)从 yscj.txt 中解析出地址,生成 you-get 脚本 yscj2.txt
from bs4 import BeautifulSoup
# Step 2 (BeautifulSoup variant): read the saved page source, take the first
# link of every <li> under the element with id="boxList", and write one
# "you-get <url>" command per line to yscj2.txt.

with open('yscj.txt', encoding='utf-8') as f:
    read_data = f.read()

soup = BeautifulSoup(read_data, 'lxml')
info = soup.find(id='boxList')
if info is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError("no element with id='boxList' found in yscj.txt")
li_content = info.find_all('li')

with open("yscj2.txt", 'w', encoding='utf-8') as f:
    for li in li_content:
        # Only consider anchors that actually carry an href; list items
        # without a usable link are skipped rather than crashing the run.
        anchor = li.find('a', href=True)
        if anchor is None:
            continue
        f.write("you-get " + anchor['href'] + "\n")