Previously I wrote a link-harvesting spider with the Scrapy framework, but it could not capture POST links, links loaded dynamically via AJAX, or links that only appear after some interaction with the page. So I decided to use spynner to simulate a browser requesting the page, inspect and record the requests fired along the way, and also drive some simple interactions through spynner to uncover the remaining links.
01 Introduction to spynner
spynner is a wrapper around QtWebKit that makes scripted browser operations much simpler to drive from Python. WebKit itself is an open-source browser engine.
For more background, see:
https://www.cnblogs.com/caroar/archive/2013/05/10/3070847.html
Project repository:
https://github.com/makinacorpus/spynner
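To get a feel for the API, a minimal spynner session looks roughly like this (a sketch based on the project's README; the exact method set varies between versions, so verify against the one you have installed):

import spynner

browser = spynner.Browser()
browser.load('http://example.com/')   # navigate and wait for the page to load
print browser.url                     # final URL after any redirects
print len(browser.html)               # rendered HTML, including JS-generated content
browser.close()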
02 Working with spynner
The related code is here: https://github.com/M954/web_scan/tree/master/spider/crawl/tutorial
To intercept the browser's requests I used two approaches. The first is quick and dirty: modify spynner's source directly, add recording to its NManager, and then add a function that returns the recorded requests.
# Patched spynner source (browser.py). The recording code below also needs
# "import datetime" at the top of the module.
class NManager(QNetworkAccessManager):

    ob = None  # Browser instance

    @classmethod
    def new(klass, spynner, cookiejar_klass=None):
        if not cookiejar_klass:
            cookiejar_klass = ExtendedNetworkCookieJar
        manager = klass()
        manager.ob = spynner
        manager.proxy_url = None
        cookiejar = cookiejar_klass()
        manager.setCookieJar(cookiejar)
        manager.cookieJar().setParent(spynner.webpage)
        # manager.request_urls = []
        return manager

    def createRequest(manager, operation, request, data):
        self = manager.ob
        jar = manager.cookieJar()
        cookies = merge_cookies(
            self.cookies,
            jar.allCookies(),
        )
        manager.cookieJar().setAllCookies(cookies)
        url = six.u(toString(request.url()))
        operation_name = self._operation_names.get(
            operation, str(operation)).upper()
        req = self.make_request(request, operation_name)
        self._debug(INFO, "Request: %s %s" % (operation_name, url))
        for h in req.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (h, req.rawHeader(h)))
        if self._url_filter:
            if self._url_filter(self._operation_names[operation], url) is False:
                self._debug(INFO, "URL filtered: %s" % url)
                req.setUrl(QUrl("about:blank"))
            else:
                self._debug(DEBUG, "URL not filtered: %s" % url)
        reply = QNetworkAccessManager.createRequest(
            manager, operation, req, data)
        # --- added: record this request ---
        request_url = request.url()
        request_item = {}
        request_item['rawurl'] = str(request_url.toString())
        request_item['domain'] = str(request_url.host())
        request_item['url'] = str(request_url.toString(request_url.RemoveQuery))
        request_item['method'] = operation_name
        request_item['port'] = request_url.port()
        # collect the query string as a dict, escaping double quotes
        query_items = {}
        if request_url.hasQuery():
            query_raw = request_url.queryItems()
            for q in query_raw:
                if len(q) >= 2:
                    query_items[str(q[0]).replace("\"", "\\\"")] = str(q[1]).replace("\"", "\\\"")
                else:
                    query_items[str(q[0]).replace("\"", "\\\"")] = ""
        request_item['query'] = query_items
        # for requests with a body, rebuild the raw HTTP request:
        # request line, headers, cookies, then the body
        request_data = ''
        if data is not None:
            request_data = request_item['method'] + ' ' + str(request_url.path()) + ' HTTP/1.1' + '\r\n'
            request_data += "Host: " + str(request_url.host()) + '\r\n'
            for h in req.rawHeaderList():
                request_data += "%s: %s" % (h, req.rawHeader(h)) + '\r\n'
            raw_cookies = "Cookie: "
            cnt = 0
            for c in cookies:
                raw_cookies += c.toRawForm().data()
                cnt += 1
                if cnt != len(cookies):
                    raw_cookies += "; "
            request_data += raw_cookies + '\r\n'
            request_data += '\r\n'
            # the body can arrive as a QByteArray, a QIODevice, or a QHttpMultiPart
            if str(type(data)) == "<class 'PyQt4.QtCore.QByteArray'>":
                request_data += str(data)
            elif str(type(data)) == "<class 'PyQt4.QtCore.QIODevice'>":
                request_data += str(data.readAll())
            elif str(type(data)) == "<class 'PyQt4.QtNetwork.QHttpMultiPart'>":
                request_data += str(data.boundary())
        request_item["data"] = request_data
        print datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S') + ' [spynner] ' + request_item['rawurl']
        # the list lives on the Browser instance (manager.ob), so initialize
        # request_urls = [] there rather than on the manager
        self.request_urls.append(request_item)
        return reply

    ......

    def get_request_urls(self):
        # added accessor: hand the recorded requests back to the caller
        return self.request_urls
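With the patched source in place, usage is straightforward. Two assumptions in this sketch: request_urls was initialized as an empty list on the Browser instance (createRequest appends to manager.ob.request_urls), and get_request_urls was added to the Browser class:

import spynner

browser = spynner.Browser()
browser.load('http://example.com/')        # every request fired during the load
for item in browser.get_request_urls():    # is now in the recorded list
    print item['method'], item['rawurl']   # e.g. GET/POST plus the full URL
browser.close()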
The second approach is cleaner: subclass Browser and NManager, add the recording variable there, and likewise expose a function that returns the recorded values. A sketch follows.
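This is a minimal sketch with hypothetical class names (RecordingManager, RecordingBrowser); it assumes your spynner version exposes NManager from spynner.browser and that Browser keeps its network access manager on self.manager — check the installed source before relying on either:

from spynner.browser import Browser, NManager

class RecordingManager(NManager):
    # NManager subclass that records every request it creates

    @classmethod
    def new(klass, spynner, cookiejar_klass=None):
        manager = super(RecordingManager, klass).new(spynner, cookiejar_klass)
        manager.request_urls = []  # the recording variable
        return manager

    def createRequest(self, operation, request, data):
        reply = NManager.createRequest(self, operation, request, data)
        self.request_urls.append(str(request.url().toString()))
        return reply

    def get_request_urls(self):
        # accessor for the recorded values
        return self.request_urls

class RecordingBrowser(Browser):
    # Browser subclass that swaps in the RecordingManager

    def __init__(self, *args, **kwargs):
        Browser.__init__(self, *args, **kwargs)
        self.manager = RecordingManager.new(self)  # assumed attribute name
        self.webpage.setNetworkAccessManager(self.manager)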
03 Interaction
The idea is to collect the page's interactive elements, trigger each one, and capture the link data generated while the interaction runs.
def get_click_point(self):
    # elements with an inline onclick handler
    elements = self.browser.webframe.findAllElements('*[onclick]')
    res = elements.toList()
    #elements = self.browser.webframe.findAllElements('input')
    #res = res + elements.toList()
    #elements = self.browser.webframe.findAllElements('a')
    #res = res + elements.toList()
    # plus everything nested inside a form
    felements = self.browser.webframe.findAllElements('form')
    for fele in felements.toList():
        fchildele = fele.findAll('*')
        res = res + fchildele.toList()
    return res
def click(self):
    total = len(self.click_point)  # fixed denominator for the progress report
    while len(self.click_point) > 0:
        e = self.click_point.pop(0)
        if usersetting.Maxclick >= 0 and self.cnt >= usersetting.Maxclick:
            print datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S") + ' [click] stop: reached max click number'
            break
        # print e.tagName()
        try:
            # click and wait briefly for any page load it triggers
            self.browser.wk_click_element(e, wait_load=True, timeout=0.5)
        except:
            self.browser.wk_click_element(e)
        self.analyse_page()
        self.cnt += 1.0
        print datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S") + ' [click] ' + "%.2f" % (self.cnt / total * 100) + '%'
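Putting the pieces together, a hypothetical driver for the class that owns get_click_point and click might look like this (Crawler, analyse_page, usersetting and the browser wiring come from the linked repo or are assumed; only the call order matters here):

crawler = Crawler()                                # hypothetical wrapper from the repo
crawler.browser.load('http://example.com/')        # render the page, AJAX included
crawler.click_point = crawler.get_click_point()    # collect the clickable elements
crawler.cnt = 0                                    # click counter used by click()
crawler.click()                                    # trigger them one by one
for item in crawler.browser.get_request_urls():    # links found during interaction
    print item['method'], item['rawurl']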