# Detect the character encoding of a fetched page
import requests
def monkey_patch():
    """Replace ``requests.models.Response.content`` with a charset-aware getter.

    requests reports ``ISO-8859-1`` as the encoding when it falls back to its
    default for text responses.  For such responses the patched getter looks
    for a charset declared inside the body, falls back to
    ``Response.apparent_encoding`` when none is declared, stores the result in
    ``Response.encoding`` and re-encodes the body as UTF-8.  Responses with
    any other encoding, or with an empty body, are returned untouched.
    """
    prop = requests.models.Response.content

    def content(self):
        """Return the body, transcoded to UTF-8 when the charset was guessed."""
        _content = prop.fget(self)
        if not _content or self.encoding != 'ISO-8859-1':
            return _content

        # The sniffing helper matches text patterns; give it text even when
        # the body is a bytes object (Python 3).  The charset declaration
        # itself is plain ASCII, so dropping other bytes loses nothing.
        if isinstance(_content, str):
            markup = _content
        else:
            markup = _content.decode('ascii', 'ignore')

        declared = requests.utils.get_encodings_from_content(markup)
        if declared:
            charset = declared[0]
        else:
            # apparent_encoding reads self.content again; clear the trigger
            # value first so that nested access returns the raw body instead
            # of re-entering this branch endlessly.
            self.encoding = None
            charset = self.apparent_encoding
        self.encoding = charset

        if not charset:
            # Nothing could be detected: leave the body as received.
            return _content
        try:
            text = _content.decode(charset, 'replace')
        except LookupError:
            # The page declared a codec Python does not know.
            return _content

        _content = text.encode('utf8', 'replace')
        self._content = _content
        return _content

    requests.models.Response.content = property(content)
# Install the patched ``content`` property as soon as this module is loaded.
monkey_patch()
# Batch-normalise a list of URLs to scheme://host
import urlparse
# Reduce every URL listed in b.txt to "scheme://host", echo it, and save the
# results one per line in new_url.txt.  Both files are closed on exit, even
# if an error occurs part-way through.
with open('new_url.txt', 'w+') as f, open('b.txt') as b:
    for line in b:
        # strip() removes the complete line ending, including the '\r' that a
        # CRLF file leaves behind, plus any surrounding blanks.
        line = line.strip()
        if not line:
            continue
        parts = urlparse.urlparse(line)
        # Entries without a scheme or host (e.g. "www.example.com") cannot be
        # rebuilt as scheme://host; skip them rather than fail on None.
        if not parts.scheme or not parts.hostname:
            continue
        url = parts.scheme + "://" + parts.hostname
        print(url)
        f.write(url)
        f.write('\n')
# Check whether a substring occurs in a string
import urllib2
#----------------------------------------------------------------------
def waf_url(url):
    """Fetch *url* and report on its response headers.

    Prints the complete header block, then the value of the ``Server``
    header (``None`` when the server did not send one), then ``yes`` if the
    text ``date`` occurs anywhere in the headers, ignoring case, or ``NO``
    otherwise.

    Args:
        url: Address to request.

    Returns:
        None.  All results are printed.

    Raises:
        urllib2.URLError: If the request cannot be completed.
    """
    resp = urllib2.urlopen(url)
    try:
        headers = resp.headers
        header_text = str(headers)
    finally:
        # Release the connection whether or not reading the headers worked.
        resp.close()

    print(header_text)
    # .get() avoids a KeyError for servers that omit the Server header.
    print(headers.get('Server'))
    if 'DATE' in header_text.upper():
        print('yes')
    else:
        print('NO')
# Substring check: ``in`` is true when the left-hand text occurs anywhere in
# the right-hand string.
wwwww = 'http://www.22222.com'
if "2" in wwwww:
    print('111111')