参考网页:
Python爬虫小白入门(六)爬取披头士乐队历年专辑封面-网易云音乐
selenium_python
项目内容:
这个项目抓取网易云音乐上陈奕迅所有专辑的封面图片,主要是在参考网页 1 的项目基础上改写,再加上一点异步 IO。
遇到问题:
1. 用 aiohttp 异步请求时,获取文本内容用 resp.text();如果要获取二进制文件(例如图片),则用 resp.read()。详见官方文档:
    async with session.get('https://api.github.com/events') as resp:
        print(await resp.text())   # 文本内容
        print(await resp.read())   # 二进制内容
项目源码:
from selenium import webdriver
from bs4 import BeautifulSoup
import asyncio,aiohttp
import os
# Browser-style User-Agent string sent as a request header when downloading images.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
# Album-list page that the browser opens; the query string selects artist id 2116.
# NOTE(review): limit=108 / offset=0 presumably control paging -- confirm against the site.
CHENYIXUN_ALL_ALBUM_URL = 'https://music.163.com/#/artist/album?id=2116&limit=108&offset=0'
# Directory (relative to the current working directory) where the images are stored.
PATH = 'CHENYIXUN'
def mkdir(path):
    """Create the directory ``path`` unless something already exists there.

    Leading and trailing whitespace is stripped from ``path`` first.

    Args:
        path: Directory to create (only the last component is created).

    Returns:
        True if the directory was created by this call, False if the path
        already existed.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            already existing (for example, a missing parent directory).
    """
    path = path.strip()
    # Attempt the creation directly instead of checking first: this avoids a
    # race between the existence check and the creation, and guarantees the
    # success message is only printed once the directory really exists.
    try:
        os.mkdir(path)
    except FileExistsError:
        print('文件夹已经存在')
        return False
    print('创建文件夹成功')
    return True
async def save_img(url,img_name,sem):
    """Download one image and write it to the file ``img_name``.

    Args:
        url: Address of the image to fetch.
        img_name: Path of the file the image bytes are written to.
        sem: Semaphore shared by all downloads; it is held for the duration
            of the HTTP request to cap the number of concurrent requests.

    Raises:
        RuntimeError: If the server responds with a status other than 200.
    """
    # Send a browser User-Agent so the request looks like it comes from a browser.
    headers = {'User-Agent': USER_AGENT}
    # ``with (await sem)`` was removed in Python 3.9; ``async with`` is the
    # supported way to hold an asyncio semaphore.
    async with sem:
        async with aiohttp.ClientSession() as client:
            async with client.get(url, headers=headers) as resp:
                # Explicit check rather than ``assert`` so the validation is
                # not stripped when Python runs with ``-O``.
                if resp.status != 200:
                    raise RuntimeError(
                        'unexpected HTTP status {status} for {url}'.format(
                            status=resp.status, url=url))
                img = await resp.read()
    with open(img_name, 'wb') as f:
        f.write(img)
    # Report success only after the bytes have actually been written.
    print('成功下载文件{img_name}并保存'.format(img_name=img_name))
def download_img(download_imgs_urls):
    """Download every image in ``download_imgs_urls`` concurrently.

    Args:
        download_imgs_urls: Mapping of target file name -> image URL.

    Returns:
        None. A failed download is reported on stdout and does not stop the
        remaining downloads.
    """
    # Nothing to fetch (e.g. every image already exists): return early rather
    # than handing asyncio an empty set of tasks, which raises ValueError.
    if not download_imgs_urls:
        return

    async def _download_all():
        # At most 5 coroutines download at once; tune this to the site's
        # tolerance -- without a limit the IP may get banned mid-run.
        # Created inside the coroutine so it belongs to the running loop.
        sem = asyncio.Semaphore(5)
        jobs = [save_img(url, img_name, sem)
                for img_name, url in download_imgs_urls.items()]
        # gather() accepts bare coroutines (asyncio.wait() no longer does as
        # of Python 3.11) and, with return_exceptions=True, hands failures
        # back in input order instead of dropping them.
        return await asyncio.gather(*jobs, return_exceptions=True)

    # Use a private loop so closing it does not break the process-wide loop
    # and the function can be called more than once.
    loop = asyncio.new_event_loop()
    try:
        results = loop.run_until_complete(_download_all())
    finally:
        loop.close()

    for img_name, result in zip(download_imgs_urls, results):
        if isinstance(result, Exception):
            print('下载文件{img_name}失败: {error!r}'.format(
                img_name=img_name, error=result))
def main():
    """Scrape the album page and download every cover not already on disk.

    Steps: make sure the output directory exists and switch into it, load the
    album page in Chrome and grab the HTML of its ``g_iframe`` frame, build a
    ``file name -> image URL`` mapping for covers that are missing, then hand
    that mapping to ``download_img``.
    """
    # Create the output directory if needed and record what is already there.
    mkdir(PATH)
    # A set gives O(1) membership tests in the loop below.
    existing_files = set(os.listdir(PATH))
    # Images are saved under bare file names, so work from inside the directory.
    os.chdir(PATH)

    driver = webdriver.Chrome()
    try:
        driver.get(CHENYIXUN_ALL_ALBUM_URL)
        # The album list is rendered inside this iframe, not the top document.
        driver.switch_to.frame('g_iframe')
        html = driver.page_source
    finally:
        # Always shut the browser down; otherwise the Chrome process is left
        # running after the script ends or fails.
        driver.quit()

    # Build the mapping of file name -> image URL for covers still missing.
    download_imgs_urls = {}
    # Characters removed from album titles before they are used in file names.
    forbidden_chars = str.maketrans('', '', '/:?\\"')
    all_li = BeautifulSoup(html,'lxml').find(id='m-song-module').find_all('li')
    for li in all_li:
        # Drop the query string from the image address.
        album_img_url = li.find('img')['src'].split('?')[0]
        album_name = li.find('div',class_='u-cover u-cover-alb3')['title']
        # Text of the first <span>; presumably the release date -- confirm
        # against the page markup.
        album_data = li.find('span').get_text()
        img_name = '{album_data}-{album_name}.jpg'.format(
            album_data=album_data,
            album_name=album_name.translate(forbidden_chars))
        if img_name in existing_files:
            print('图片已经存在,不再下载')
            continue
        download_imgs_urls[img_name] = album_img_url
        # Remember the name so a duplicate entry on the page is skipped too.
        existing_files.add(img_name)

    # Finally, download everything asynchronously.
    download_img(download_imgs_urls)


# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()