多线程爬虫
有些操作比较耗时,比如下载图片。如果采用同步的方式逐个下载,效率会非常低,这时候我们就可以考虑使用多线程的方式下载
多线程的介绍
多线程的讲解请参考:https://coding.imooc.com/class/200.html
注意:由于 Python 中 GIL 的存在,多线程无法利用多核 CPU 的优势,但是在处理 I/O 读写和网络请求方面,仍然能够提高程序的处理性能
爬取斗图啦中的最新表情:http://www.doutula.com/photo/list/
单线程爬取斗图啦里的表情包
import requests
from urllib import request
import os
import re
from lxml import etree
import time
def parse_page(url):
    """Download every non-GIF image found on one listing page.

    Fetches ``url``, selects the ``<img>`` elements inside the
    ``page-content text-center`` container whose class is not ``gif``, and
    saves each one to ``images/<alt text><extension>``. The file name is
    printed before each download.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the request times out).
        OSError: If an image cannot be downloaded or written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'}
    # Without a timeout requests may wait on an unresponsive server forever.
    response = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(response.text)
    imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
    # urlretrieve does not create missing directories.
    os.makedirs('images', exist_ok=True)
    for img in imgs:
        img_url = img.get('data-original')
        if not img_url:
            # get() returns None when the attribute is absent; there is
            # nothing to download and splitext(None) would raise TypeError.
            continue
        alt = img.get('alt') or ''
        # Drop punctuation (ASCII and full-width) plus the characters that
        # are not allowed in file names, so the alt text is a safe name.
        alt = re.sub(r'[\\/:*?"<>|\.,,。??!!]', '', alt)
        suffix = os.path.splitext(img_url)[1]
        filename = alt + suffix
        print(filename)
        request.urlretrieve(img_url, 'images/' + filename)
def main():
    """Crawl the listing pages in ``range(1, 2)`` one after another."""
    page_template = 'http://www.doutula.com/photo/list/?page={}'
    for page_no in range(1, 2):
        parse_page(page_template.format(page_no))
if __name__ == '__main__':
    # Run the crawl and print how many seconds it took.
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    print(elapsed)
多线程爬取斗图啦里的表情包
生产者:获取表情图片的url
消费者:下载表情图片
import requests
import threading
from queue import Queue
from urllib import request
import os
import re
from lxml import etree
import time
class Producer(threading.Thread):
    """Thread that parses listing pages and queues image download jobs.

    Args:
        page_queue: Queue of listing-page URLs to parse. A ``None`` item is
            a stop marker that ends this thread.
        img_queue: Queue that receives one ``(img_url, filename)`` tuple for
            every image found.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages from ``page_queue`` until a ``None`` marker arrives."""
        while True:
            url = self.page_queue.get()
            if url is None:
                # Stop marker: a blocking get() on a drained queue would
                # otherwise keep this thread (and the process) alive forever.
                break
            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # A page that cannot be fetched should not stop this thread
                # from handling the remaining pages.
                print('failed to fetch {}: {}'.format(url, exc))

    def parse_page(self, url):
        """Fetch ``url`` and queue ``(img_url, filename)`` for each non-GIF image.

        Raises:
            requests.RequestException: If the page cannot be fetched
                (including when the request times out).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'}
        # Without a timeout requests may wait on an unresponsive server forever.
        response = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            img_url = img.get('data-original')
            if not img_url:
                # get() returns None when the attribute is absent; there is
                # nothing to download and splitext(None) would raise TypeError.
                continue
            alt = img.get('alt') or ''
            # Drop punctuation (ASCII and full-width) plus the characters
            # that are not allowed in file names.
            alt = re.sub(r'[\\/:*?"<>|\.,,。??!!]', '', alt)
            suffix = os.path.splitext(img_url)[1]
            filename = alt + suffix
            self.img_queue.put((img_url, filename))


class Consumer(threading.Thread):
    """Thread that downloads the images queued by ``Producer`` threads.

    Args:
        page_queue: Queue of listing-page URLs; stored but not read here.
        img_queue: Queue of ``(img_url, filename)`` jobs. A ``None`` item is
            a stop marker that ends this thread.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until a ``None`` marker arrives."""
        while True:
            job = self.img_queue.get()
            if job is None:
                # Stop marker: checking empty() on both queues instead would
                # race with a producer that is still parsing its last page.
                break
            img_url, filename = job
            try:
                request.urlretrieve(img_url, 'images/' + filename)
            except (OSError, ValueError) as exc:
                # urllib's URLError/HTTPError are OSError subclasses and a
                # malformed URL raises ValueError. Report and move on so one
                # bad image does not end this thread.
                print('failed to download {}: {}'.format(img_url, exc))


def main():
    """Crawl listing pages 1-4 with 5 producer and 5 consumer threads.

    Blocks until every queued image has been processed, so the time spent
    in this function covers the whole crawl.
    """
    page_queue = Queue(100)
    img_queue = Queue(1000)
    # urlretrieve does not create missing directories.
    os.makedirs('images', exist_ok=True)

    for i in range(1, 5):
        url = 'http://www.doutula.com/photo/list/?page={}'.format(i)
        page_queue.put(url)

    producers = [Producer(page_queue, img_queue) for _ in range(5)]
    consumers = [Consumer(page_queue, img_queue) for _ in range(5)]

    # One stop marker per producer. The queue is FIFO, so the markers are
    # only reached after every real URL has been taken.
    for _ in producers:
        page_queue.put(None)

    for worker in producers + consumers:
        worker.start()

    # Once all producers have finished, no further jobs can appear, so it is
    # safe to tell the consumers to stop after draining what is queued.
    for worker in producers:
        worker.join()
    for _ in consumers:
        img_queue.put(None)
    for worker in consumers:
        worker.join()
if __name__ == '__main__':
    # Print the number of seconds spent inside main().
    t0 = time.time()
    main()
    t1 = time.time()
    print(t1 - t0)