# Wpsec-H4rdy http://www.h4rdy.me/post/1cba6492_6b61da3
# Wpsec-p0di http://zone.wooyun.org/content/11888
#!/usr/bin/python
# -*- coding: utf-8 -*-
#author:Erie
import re
import sys
import requests
import Queue
import threading
from bs4 import BeautifulSoup
from urlparse import urljoin
from urlparse import urlparse
from urlparse import urlunparse
from posixpath import normpath
# Python 2 idiom: reload(sys) is called so that sys.setdefaultencoding can be
# used below to change the interpreter-wide implicit str/unicode encoding.
reload(sys)
sys.setdefaultencoding('utf-8')
# NOTE(review): this second call replaces the 'utf-8' default set on the line
# above, so 'gbk' is the encoding actually in effect - confirm which is intended.
sys.setdefaultencoding('gbk')
# Work queue shared by all Spider threads. Despite the name it holds the URLs
# still waiting to be crawled: main() seeds it and Spider.getpage() adds links.
VisitedUrl = Queue.Queue()
# URLs that a worker has already taken off the queue (appended in Spider.run).
VisitedLinks = []
#Spider Function()
class Spider(threading.Thread):
    """Worker thread that crawls the pages of one host from a shared queue.

    Each worker repeatedly takes a URL from ``queue``, records it in the
    module-level ``VisitedLinks`` list, prints it, downloads the page and puts
    every acceptable link found on it back on the queue.

    Attributes:
        queue: queue of URLs to crawl, shared by all workers.
        host:  network location (``netloc``) of the start URL.
        pro:   scheme of the start URL.
        path:  path of the start URL; used as the baseline for the depth limit.
    """

    # Seconds to wait for a response. Without a limit a stalled server would
    # block the worker, and therefore queue.join(), indefinitely.
    REQUEST_TIMEOUT = 10
    # A relative link is followed only if its path has at most this many more
    # '/' separators than the start URL's path.
    MAX_EXTRA_DEPTH = 3
    # A relative link is ignored if its resolved URL contains any of these
    # substrings (fragments, scripts and images).
    SKIPPED_SUBSTRINGS = ('#', '.js', '.jpg', '.bmp', '.png', '.gif')
    # Serialises the check-then-append on VisitedLinks across all workers.
    _visited_lock = threading.Lock()

    def __init__(self, queue, links):
        """Create a worker.

        Args:
            queue: shared queue the worker reads URLs from and adds links to.
            links: start URL of the crawl; fixes the host and base path.
        """
        threading.Thread.__init__(self)
        start = urlparse(links)
        self.queue = queue
        self.host = start.netloc
        self.pro = start.scheme
        self.path = start.path
        # Links are matched on ``hostname`` (lower-cased, no port), so keep the
        # start URL in the same form; ``netloc`` would never match when the
        # start URL carries a port or upper-case letters.
        self._hostname = start.hostname

    def myjoin(self, base, url):
        """Resolve *url* against *base* and normalise the resulting path.

        Returns the absolute URL with ``.``/``..`` segments and duplicate
        slashes collapsed by ``posixpath.normpath``.
        """
        absolute = urlparse(urljoin(base, url))
        # normpath('') is '.', which would turn "http://host" into
        # "http://host/."; an empty path is therefore left untouched.
        path = normpath(absolute.path) if absolute.path else absolute.path
        return urlunparse((absolute.scheme, absolute.netloc, path,
                           absolute.params, absolute.query, absolute.fragment))

    def _same_host(self, candidate):
        """Return True when URL *candidate* is on the start URL's host."""
        return (self._hostname is not None
                and urlparse(candidate).hostname == self._hostname)

    def _select(self, page_url, link):
        """Return the URL to crawl for *link* found on *page_url*, or None.

        Raises:
            ValueError: if the link cannot be parsed as a URL.
        """
        # NOTE(review): any link containing "http://" is treated as absolute
        # and is only host-checked (no depth/extension filter); "https://"
        # links take the branch below. Kept as in the original.
        if "http://" in link:
            return link if self._same_host(link) else None

        target = self.myjoin(page_url, link)
        if not self._same_host(target):
            return None
        extra_depth = urlparse(target).path.count('/') - self.path.count('/')
        if extra_depth > self.MAX_EXTRA_DEPTH:
            return None
        if any(marker in target for marker in self.SKIPPED_SUBSTRINGS):
            return None
        return target

    def _claim(self, url):
        """Atomically mark *url* as visited; False if it already was."""
        with Spider._visited_lock:
            if url in VisitedLinks:
                return False
            VisitedLinks.append(url)
            return True

    def getpage(self, url):
        """Download *url* and queue every link on it that should be crawled.

        Failures are best effort: a page that cannot be fetched or parsed is
        reported on stderr and skipped, and a single malformed link is skipped
        without discarding the remaining links of the page.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            anchors = BeautifulSoup(response.content).findAll("a")
        except Exception as exc:
            sys.stderr.write('skipped %s: %s\n' % (url, exc))
            return

        for anchor in anchors:
            # .get() avoids the KeyError raised for <a> tags without href.
            link = anchor.get('href')
            if link is None:
                continue
            try:
                target = self._select(url, link)
            except ValueError:
                # urlparse rejects some malformed links.
                continue
            if target is not None and target not in VisitedLinks:
                self.queue.put(target)

    def run(self):
        """Process queued URLs forever, crawling each distinct URL once."""
        while True:
            target = self.queue.get()
            try:
                # The same URL can be queued several times before its first
                # visit; only the worker that claims it fetches it.
                if self._claim(target):
                    print(target)
                    self.getpage(target)
            finally:
                # Always acknowledge the item, otherwise queue.join() in the
                # caller would wait forever after a failure.
                self.queue.task_done()
#main Function()
def main(Url, thread_count=5):
    """Crawl the site rooted at *Url* with a pool of Spider worker threads.

    Args:
        Url: start URL; also defines the host the workers stay on.
        thread_count: number of worker threads to start (default 5).

    Returns once every queued URL has been processed.
    """
    for _ in range(thread_count):
        spider = Spider(VisitedUrl, Url)
        # The workers loop forever; as daemon threads they no longer keep the
        # process alive once the queue has been drained and join() returns.
        spider.daemon = True
        spider.start()
    # Seed the shared queue, then block until every item has been marked done.
    VisitedUrl.put(Url)
    VisitedUrl.join()
if __name__ == "__main__":
    # Crawl the URL given as the first command-line argument; without one,
    # fall back to the site this script was originally written for.
    main(sys.argv[1] if len(sys.argv) > 1 else 'http://www.sta.edu.cn')