In this installment, we take the five upgrades from Part 3 and wrap them into a simple crawler utility class.
import urllib.request
from urllib.error import HTTPError
from urllib import robotparser
import ssl


class XiaSpider(object):
    def __init__(self, user_agent="lsp", robots=True):
        self.user_agent = user_agent
        self.robots = robots
        # Skip SSL certificate verification (note: this uses a private ssl helper)
        self.context = ssl._create_unverified_context()

    def get_url(self, request, retries):
        """Download the request, retrying on 5xx server errors."""
        try:
            content = urllib.request.urlopen(request, context=self.context).read()
        except HTTPError as e:
            # Retry only on server-side (5xx) errors while retries remain
            if 500 <= e.code < 600 and retries > 0:
                return self.get_url(request, retries - 1)
            print("Error occurred:", e)
            return False
        except Exception as e:
            print("Error occurred:", e)
            return False
        else:
            return content

    def get_robots(self, url, robots_url):
        """Check whether robots.txt allows this user agent to fetch the URL."""
        if not robots_url:
            # Derive the robots.txt URL from the scheme and host of the target URL
            urll = url.split("/")
            if len(urll) >= 3:
                robots_url = urll[0] + "//" + urll[2] + "/robots.txt"
            else:
                return True
        rp = robotparser.RobotFileParser(robots_url)
        rp.read()
        if rp.can_fetch(self.user_agent, url):
            return True
        else:
            print("According to the robots protocol, the site %s may not be fetched" % url)
            return False

    def get(self, url, retries=3, robots_url=None):
        """Fetch a URL, honouring robots.txt and retrying on server errors."""
        if self.robots:
            if not self.get_robots(url, robots_url):
                return None
        request = urllib.request.Request(url)
        request.add_header("User-Agent", self.user_agent)
        content = self.get_url(request, retries)
        if content:
            return content
        else:
            return None
With the object's get method, you can easily put together a crawler that follows the five upgrades described earlier.
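As a minimal usage sketch (the target URL below is only a placeholder, not one from the original post), you would create a spider object and call get:

# Usage sketch: "http://example.com" is a placeholder URL for illustration only
spider = XiaSpider(user_agent="lsp", robots=True)
html = spider.get("http://example.com", retries=3)
if html is not None:
    print(html[:200])  # print the first part of the downloaded bytes
else:
    print("Fetch failed or disallowed by robots.txt")

get returns the raw response bytes on success, or None if the download fails after the retries or robots.txt forbids access.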