# coding:utf-8
import requests
import threading #多线程
from lxml import etree #解析网页
from bs4 import BeautifulSoup #页面提取
# Make utf-8 the encoding used for output
import sys
# Python 2 only: force the interpreter-wide default string encoding to utf-8.
# site.py deletes sys.setdefaultencoding at startup, so the module has to be
# reloaded to get the function back. Neither the reload() builtin nor
# setdefaultencoding exists on Python 3 (whose default is already utf-8), so
# the hack is skipped there instead of raising NameError on import.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def index_url(url):
    """Download ``url`` and return the raw response body.

    A browser-like ``User-Agent`` header is sent with the request so the
    crawler looks like a regular browser to the site's anti-crawler checks.

    :param url: address of the page to fetch.
    :return: the ``content`` attribute of the ``requests`` response.
    """
    # The same User-Agent value is used for every request.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
    # The second positional parameter of requests.get() is ``params`` (the
    # query string), so the dict has to be passed by keyword to actually be
    # sent as HTTP headers.
    html = requests.get(url, headers=header).content
    return html
# Get the link of every image
def get_img(html):
    """Locate the image entries in a page and pass them on for download.

    :param html: markup of the page to scan.
    """
    tree = etree.HTML(html)
    # Each "artile_des" <div> is searched for <img> tags nested in a table;
    # what gets collected is the ``onerror`` attribute of those tags.
    for block in tree.xpath('//div[@class="artile_des"]'):
        onerror_values = block.xpath('table/tbody/tr/td/a/img/@onerror')
        start_save_img(onerror_values)
# Get the <a> tag links that point to the detail pages
def get_img_html(html):
    """Fetch every page linked from ``html`` and scan it for images.

    :param html: markup containing ``<a class="list-group-item">`` links.
    """
    listing = BeautifulSoup(html, 'lxml')
    # ``class`` is a Python keyword, hence the trailing underscore in the
    # ``class_`` filter argument.
    for anchor in listing.find_all('a', class_="list-group-item"):
        detail_html = index_url(anchor['href'])
        get_img(detail_html)
# Download an image
def save_img(img_url):
    """Download a single image and write it into ``./doutu/``.

    :param img_url: raw ``onerror`` attribute value of an ``<img>`` tag.
        Presumably of the form ``this.src='//host/path/name.jpg'``
        (TODO confirm against the live page).
    """
    # Keep the text after the last '=' and drop the single quotes around it.
    # The quote literal was malformed in the original (the escaped quote had
    # lost its backslash), which made the whole file a syntax error.
    img_url = img_url.split('=')[-1].replace("'", '')
    # Presumably the extracted address is protocol-relative ("//host/..."),
    # so a scheme is prepended.
    img_url = 'http:' + img_url
    img_content = requests.get(img_url).content
    file_name = img_url.split('/')[-1]
    save_path = './doutu/' + file_name
    # NOTE: open() does not create ./doutu/; the directory must already exist.
    with open(save_path, 'wb') as f:
        # A parenthesised single-argument print is valid on Python 2 and 3.
        print(u'正在下载' + file_name)
        f.write(img_content)
# Multithreading: call the image download function
def start_save_img(imgurl_list):
    """Start one ``save_img`` thread per entry of ``imgurl_list``.

    The threads are started but not joined here.

    :param imgurl_list: iterable of values, each passed to ``save_img``.
    """
    for entry in imgurl_list:
        threading.Thread(target=save_img, args=(entry,)).start()
def main():
    """Crawl the article listing pages and download the images they link to."""
    # range(1, 2) covers page 1 only; raise the upper bound to crawl more.
    for page in range(1, 2):
        page_url = 'https://www.doutula.com/article/list/?page=' + str(page)
        get_img_html(index_url(page_url))
# Run the crawler only when this file is executed as a script, not on import.
# The dunder underscores were missing, so the check raised NameError on the
# undefined name ``name``.
if __name__ == '__main__':
    main()