涉及:
lxml 模块的 CSS 选择器
requests 库
threading 多线程
测试环境为 Python 2.7
#coding=utf8
import re
import os
import lxml.html
import requests
import threading
import time
def get_url(url):
    """Scrape one listing page and hand its images to the downloader.

    Fetches ``url``, selects every ``a.col-xs-6.col-sm-3`` anchor and, for
    each one, collects the first ``data-original`` attribute found on the
    anchor or any of its descendants together with the anchor's text, which
    serves as the image title. The two resulting lists are passed to
    ``start_save_img``.

    Image URLs are stored with the scheme and leading ``//`` removed
    (``http://``, ``https://`` and protocol-relative forms are accepted),
    since the downloader adds a scheme itself. Anchors without a usable
    ``data-original`` value are skipped rather than aborting the page.

    Args:
        url: Address of the listing page to scrape.
    """
    html = requests.get(url).content
    soup = lxml.html.fromstring(html)
    src_list = soup.cssselect('a.col-xs-6.col-sm-3')
    # Built once per page instead of once per anchor.
    scheme_prefix = re.compile(r'(?:https?:)?//')
    url_list = []
    title_list = []
    for src in src_list:
        # Read the attribute from the parsed tree instead of regex-matching
        # serialized HTML: the value comes back unescaped, and a missing
        # attribute gives an empty list rather than a failed match.
        originals = src.xpath('descendant-or-self::*/@data-original')
        if not originals:
            continue
        raw_url = originals[0]
        prefix = scheme_prefix.match(raw_url)
        if prefix is None:
            # Not an absolute or protocol-relative URL; nothing to download.
            continue
        # Appending URL and title together keeps both lists index-aligned.
        url_list.append(raw_url[prefix.end():])
        # Title is the anchor's text with newlines and spaces removed.
        title = src.text_content().replace('\n', '').replace(' ', '').strip()
        title_list.append(title)
    start_save_img(url_list, title_list)
def save_img(img_url, title):
    """Download one image and store it as ``img/<title><extension>``.

    Args:
        img_url: Image address without a scheme; ``http://`` is prepended.
        title: Image title, used as the file name. Path separators in it
            are replaced with ``_`` so the file always lands inside
            ``img/``.

    The extension is the first ``.jpg``/``.jpeg``/``.png``/``.gif`` found
    in the URL; when none is present the file is saved without an
    extension instead of raising.

    Raises:
        OSError: If ``img`` cannot be created and is not already a
            directory.
    """
    img_url = 'http://' + img_url
    # Every alternative has an escaped dot, so e.g. "agif" no longer counts
    # as an extension; an unknown extension no longer crashes the worker.
    match = re.search(r'\.(?:jpe?g|png|gif)', img_url)
    img_format = match.group() if match else ''
    # Announce before fetching so the message reflects work in progress.
    print('正在下载' + img_url)
    img_content = requests.get(img_url).content
    # Several download threads reach this point at once; attempting the
    # mkdir and tolerating "already exists" avoids the check-then-create
    # race of an exists() test followed by mkdir().
    try:
        os.mkdir('img')
    except OSError:
        if not os.path.isdir('img'):
            raise
    # The title is scraped text: keep it from escaping the img directory.
    file_stem = title.replace('/', '_').replace('\\', '_')
    if not isinstance(file_stem, str):
        # Python 2 unicode title: encode so formatting into a byte string
        # does not fall back to the ASCII codec.
        file_stem = file_stem.encode('utf-8')
    with open('img/{}{}'.format(file_stem, img_format), 'wb') as f:
        f.write(img_content)
def start_save_img(url_list, title_list):
    """Spawn one download thread per collected image.

    Each entry of ``url_list`` is paired with the entry of ``title_list``
    at the same index and handed to ``save_img`` on its own thread. The
    threads are started immediately and are not joined here.
    """
    for position, link in enumerate(url_list):
        worker = threading.Thread(
            target=save_img,
            args=(link, title_list[position]),
        )
        worker.start()
def start(first_page=1, last_page=9, delay=3):
    """Scrape a range of listing pages, pausing between them.

    Args:
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).
        delay: Seconds to sleep after each page is processed.

    The defaults reproduce the original fixed run over pages 1 through 9
    with a three-second pause after each page.
    """
    for page in range(first_page, last_page + 1):
        url = 'https://www.doutula.com/photo/list/?page={}'.format(page)
        get_url(url)
        # Pause so consecutive page requests are spaced out.
        time.sleep(delay)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    start()