Preface
- 蛋肥 has been learning how to speed up web scraping, and plans to build single-threaded, multi-threaded, multi-process, and coroutine-based scrapers, run the same scrape with each, and compare their actual speeds.
Preparation
Scrape date: 2021/03/10
System environment: Windows 10
Tools: Jupyter Notebook / Python 3
Libraries: requests / lxml / selenium / time / threading / queue / multiprocessing / gevent / sys
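Of these, requests, lxml, selenium, and gevent are third-party packages and need installing first (the rest ship with Python), for example:

pip install requests lxml selenium gevent

selenium additionally needs a ChromeDriver binary matching the installed Chrome version.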
Getting the URL information
import requests
from lxml import etree

# Fetch the page and extract the nodes matching an XPath expression
def getinfo(xpath):
    url = "https://hao.uisdc.com/"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0"}
    r = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(r.text)
    info = html.xpath(xpath)
    return info

link = getinfo('//div[@class="item"]/a/@href')
title = getinfo('//div[@class="item"]/a/h3/text()')
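Before timing anything, it is worth a quick sanity check that link and title came back non-empty and aligned; a minimal sketch:

# Verify the scrape: the counts should match and the pairs should look sane
print(len(link), len(title))
for u, t in zip(link[:3], title[:3]):
    print(t, u)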
Taking page screenshots
Single-threaded scraper
from selenium import webdriver
import time

# Visit each URL in turn with a single browser instance and save a screenshot
def getshot(url, name):
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    start = time.time()
    for i in range(len(url)):
        try:
            driver.get(url[i])
            # Wait for the page to finish loading
            time.sleep(1)
            driver.save_screenshot(r"C:\Users\Archer\Desktop\网页截图\img" + name[i] + ".png")
        except:
            continue
    end = time.time()
    print("Single-threaded scraper time:", end - start)
    driver.quit()  # release the browser when done

getshot(link, title)
Multi-threaded scraper
References
Python multithreading
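The scraper below follows the standard worker/queue pattern: several threads pull URLs off a shared queue.Queue until a get() timeout signals that the queue has drained. A minimal, self-contained sketch of just that pattern (a time.sleep stands in for the screenshot work):

import threading
import queue
import time

def worker(q):
    while True:
        try:
            item = q.get(timeout=2)   # queue.Empty is raised once the queue runs dry
        except queue.Empty:
            break
        time.sleep(0.1)               # stand-in for the real I/O-bound work
        print(threading.current_thread().name, "processed", item)

q = queue.Queue()
for i in range(10):
    q.put(i)
threads = [threading.Thread(target=worker, args=(q,), name=f"Worker-{n}") for n in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()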
import threading
import time
import queue as Queue
from selenium import webdriver

start = time.time()

# Screenshot function; get() uses a timeout so a thread never blocks forever on an empty queue
def getshot(name, url):
    url = url.get(timeout=2)
    picname = name + " " + str(time.time())
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    try:
        driver.get(url)
        # Wait for the page to finish loading
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\网页截图\img" + picname + ".png")
    except:
        print(name + " failed")
    finally:
        driver.quit()  # always release the browser, even on failure

class myThread(threading.Thread):
    def __init__(self, name, url):
        threading.Thread.__init__(self)
        self.name = name
        self.url = url
    def run(self):
        # Keep pulling URLs until queue.Empty (raised by the get timeout) breaks the loop
        while True:
            try:
                getshot(self.name, self.url)
            except:
                break

threadlist = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(200)
threads = []
# Create the worker threads (they start before the queue is filled; the 2-second get timeout covers that gap)
for tName in threadlist:
    thread = myThread(tName, workQueue)
    thread.start()
    threads.append(thread)
# Fill the queue
for i in range(len(link)):
    workQueue.put(link[i])
# Wait for all threads to finish
for t in threads:
    t.join()
end = time.time()
print("Queue multi-threaded scraper time:", end - start)
Multi-process scraper
References
Why only one child process runs under multiprocessing, and how to fix it
When doing multi-process programming in Python, only the main process runs and the child processes don't seem to run — why?
Interview notes: the differences between multi-processing and multi-threading
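The issues those references cover mostly trace back to how Windows spawns processes: each child re-imports the script, so process creation must live behind an if __name__ == "__main__" guard, and all processes should be started before any is joined, or they will run one at a time. A minimal sketch of the correct pattern (a print stands in for the screenshot work):

from multiprocessing import Process, Queue
from queue import Empty

def worker(q):
    while True:
        try:
            item = q.get(timeout=1)   # Empty ends the loop once the queue drains
        except Empty:
            break
        print("processed", item)

if __name__ == "__main__":            # required on Windows: children re-import this module
    q = Queue()
    for i in range(10):
        q.put(i)
    procs = [Process(target=worker, args=(q,)) for _ in range(3)]
    for p in procs:                   # start them all first...
        p.start()
    for p in procs:                   # ...then join, so they actually run in parallel
        p.join()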
# With a single-core CPU there can be no multi-process parallelism, so first check how many cores the machine has
from multiprocessing import cpu_count
print(cpu_count())  # 蛋肥's machine has 8 cores
# On Windows, the code below must be saved as a .py file and launched from the command line (蛋肥 used the Anaconda Powershell Prompt);
# the getinfo code above must be included in that file so that link is defined
from multiprocessing import Process, Queue
import time
from selenium import webdriver

start = time.time()

# Screenshot function; get() uses a timeout so a process never blocks forever on an empty queue
def getshot(name, url):
    url = url.get(timeout=2)
    picname = name + " " + str(time.time())
    driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
    driver.maximize_window()
    try:
        driver.get(url)
        # Wait for the page to finish loading
        time.sleep(1)
        driver.save_screenshot(r"C:\Users\Archer\Desktop\网页截图\img" + picname + ".png")
    except:
        print(name + " failed")
    finally:
        driver.quit()  # always release the browser, even on failure

class myProcess(Process):
    def __init__(self, name, url):
        Process.__init__(self)
        self.name = name
        self.url = url
    def run(self):
        # Keep pulling URLs until queue.Empty (raised by the get timeout) breaks the loop
        while True:
            try:
                print(self.name)
                getshot(self.name, self.url)
            except:
                break

# The if guard is required: on Windows, child processes re-import this module,
# and without the guard they would recursively spawn more processes
if __name__ == "__main__":
    processlist = ["Process-1", "Process-2", "Process-3", "Process-4", "Process-5"]
    workQueue = Queue(200)
    processes = []
    # Fill the queue
    for i in range(len(link)):
        workQueue.put(link[i])
    # Create the worker processes: start them all first, then join them all
    for pName in processlist:
        process = myProcess(pName, workQueue)
        processes.append(process)
    for t in processes:
        t.start()
    for t in processes:
        t.join()
    end = time.time()
    print("Queue multi-process scraper time:", end - start)
Coroutine scraper
References
Using Python's gevent module and the MonkeyPatchWarning it can raise
Python's maximum recursion depth error: "maximum recursion depth exceeded while calling a Python object"
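The gevent pattern used below, in miniature: monkey-patch the standard library first so blocking calls yield to other greenlets, then spawn workers that drain a gevent queue. A self-contained sketch (gevent.sleep stands in for the real work):

from gevent import monkey
monkey.patch_all()        # must run before any other imports that do I/O
import gevent
from gevent.queue import Queue, Empty

def worker(n, q):
    while True:
        try:
            item = q.get(timeout=2)   # Empty is raised once the queue drains
        except Empty:
            break
        gevent.sleep(0.1)             # stand-in for the real I/O-bound work
        print("greenlet", n, "processed", item)

q = Queue()
for i in range(10):
    q.put(i)
gevent.joinall([gevent.spawn(worker, n, q) for n in range(3)])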
# monkey.patch_all() must come first, before the URL-fetching code (requests) is imported
import gevent
from gevent import monkey
monkey.patch_all()
# Raise the recursion depth limit to avoid "maximum recursion depth exceeded" errors
import sys
sys.setrecursionlimit(1000000)
from gevent.queue import Queue, Empty
import time
from selenium import webdriver

start = time.time()

# Screenshot function; get() uses a timeout so a greenlet never blocks forever on an empty queue
def getshot(index):
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        picname = "Process-" + str(index) + str(time.time())
        driver = webdriver.Chrome(executable_path=r"C:\Users\Archer\AppData\Local\Google\Chrome\Application\chromedriver")
        driver.maximize_window()
        try:
            driver.get(url)
            # Wait for the page to finish loading
            time.sleep(1)
            driver.save_screenshot(r"C:\Users\Archer\Desktop\网页截图\img" + picname + ".png")
        except:
            print("failed")
        finally:
            driver.quit()  # always release the browser, even on failure

def boss():
    # Fill the queue
    for i in range(len(link)):
        workQueue.put_nowait(link[i])

if __name__ == "__main__":
    workQueue = Queue(10000)
    gevent.spawn(boss).join()
    jobs = []
    for i in range(5):
        jobs.append(gevent.spawn(getshot, i))
    gevent.joinall(jobs)
    end = time.time()
    print("Queue coroutine scraper time:", end - start)
Results
Further study
Summary
- Multi-threading, multi-processing, and coroutines can all speed up data scraping, but the number of workers must be chosen sensibly; blindly adding more can backfire.
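One practical way to find the sweet spot is to time the same workload at several worker counts; past a point, per-worker overhead (here, a whole Chrome instance per worker) outweighs the gain. A hypothetical benchmark sketch using a dummy I/O task:

import time
from concurrent.futures import ThreadPoolExecutor

def fake_io(_):
    time.sleep(0.2)       # stand-in for one page load plus screenshot

for n in (1, 2, 5, 10, 20):
    start = time.time()
    with ThreadPoolExecutor(max_workers=n) as pool:
        list(pool.map(fake_io, range(40)))
    print(f"{n} workers: {time.time() - start:.2f}s")

With a pure sleep the scaling looks ideal; with real browsers, memory pressure and CPU contention flatten and eventually reverse the curve, which is the point of the summary above.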