大师兄的Python学习笔记(二十四): 爬虫(五)
大师兄的Python学习笔记(二十六): 爬虫(七)
七、识别验证码
1. 识别简单图形验证码
- 通常由4位字母或数字组成。
- 需要使用 tesserocr 库做图像文字识别。
1.1 在windows下安装tesserocr库
- tesserocr 库在windows下需要 tesseract 的支持,所以需要先安装 tesseract,点击此处下载并安装。
- 将 tesseract 添加到环境变量。
- 点击下载tesserocr库
- 使用 pip install ./tesserocr-2.4.0-cp37-cp37m-win_amd64.whl 安装,具体位置根据你的文件位置调整。
1.2 识别方法
1) tesserocr.image_to_text(image)
- 识别图形中的文字。
2) tesserocr.file_to_text(image)
- 识别图片文件中的文字,效果不如 image_to_text(image)。
>>>import tesserocr
>>>import os
>>>from PIL import Image
>>>path = os.path.join("d:\\","sample_code","Graphical_verification_code.jpg")
>>>image = Image.open(path)
>>>image = image.convert('L') # 转为灰度图像
>>>result = tesserocr.image_to_text(image)
>>>print(result)
D5Qe
2. 识别极验验证码
- 极验验证码是现在大部分网站的验证方式https://www.geetest.com/。
- 主要以点按、滑动、选字、选图、识字组词等方式验证。
- 可以尝试使用Selenium库模拟页面行为通过验证。
- 极验验证码是在不停升级的,所谓道高一尺魔高一丈...
2.1 识别环境
以极验官网后台登录https://auth.geetest.com/login为例,需要考虑三种情况:
- 模拟点击
- 滑动拼图到缺口
- 模拟拖块滑动
2.2 模拟点击
- 模拟点击比较简单,直接用Selenium包模拟点击即可。
>>>from selenium import webdriver
>>>from selenium.webdriver.support.wait import WebDriverWait
>>>from selenium.webdriver.support import expected_conditions as EC
>>>from selenium.webdriver.common.by import By
class Geetest_sample():
    """Log in to the Geetest console and click the "click to verify" button.

    Creating an instance starts a Firefox session through Selenium.
    """

    def __init__(self, user_name, password, url='https://auth.geetest.com/login'):
        """Remember the credentials and start the browser.

        Args:
            user_name: Text typed into the user-name field.
            password: Text typed into the password field.
            url: Address of the login page.
        """
        self.url = url
        self.user_name = user_name
        self.password = password
        self.browser = webdriver.Firefox()
        # Explicit waits on this browser give up after 10 seconds.
        self.wait = WebDriverWait(self.browser, 10)

    def get_button(self):
        """Wait for the click-to-verify element to be clickable and return it."""
        return self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip_content'))
        )

    def sort_username_password(self):
        """Type the stored user name and password into the login form."""
        input_username = self.browser.find_element(By.CSS_SELECTOR, '.ivu-input')
        input_password = self.browser.find_element(By.CSS_SELECTOR, '[placeholder~=请输入密码]')
        input_username.send_keys(self.user_name)
        input_password.send_keys(self.password)

    def crack_geek(self):
        """Open the login page, fill in the form and click the captcha button."""
        self.browser.get(self.url)
        self.sort_username_password()
        button = self.get_button()
        button.click()
# Script entry point: run the demo with placeholder credentials.
if __name__ == '__main__':
    gs = Geetest_sample('test', 'test')
    gs.crack_geek()
2.3 滑动拼图到缺口
- 关键在于使用边缘检测算法找到缺口位置,之后将滑块移动到缺口位置。
- 滑块动作需要模拟人的动作,比如先加速再减速。
>>>from selenium import webdriver
>>>from selenium.webdriver import ActionChains
>>>from selenium.webdriver.support.wait import WebDriverWait
>>>from selenium.webdriver.support import expected_conditions as EC
>>>from selenium.webdriver.common.by import By
>>>from PIL import Image
>>>from io import BytesIO
>>>import time
class Geetest_sample():
    """Log in to the Geetest console and solve the slide-puzzle captcha.

    Creating an instance starts a Firefox session through Selenium.
    """

    def __init__(self, user_name, password, url='https://auth.geetest.com/login'):
        """Remember the credentials and start the browser.

        Args:
            user_name: Text typed into the user-name field.
            password: Text typed into the password field.
            url: Address of the login page.
        """
        self.url = url
        self.user_name = user_name
        self.password = password
        self.browser = webdriver.Firefox()
        # Explicit waits on this browser give up after 10 seconds.
        self.wait = WebDriverWait(self.browser, 10)

    def get_button(self):
        """Wait for the click-to-verify element to be clickable and return it."""
        return self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip_content'))
        )

    def sort_username_password(self):
        """Type the stored user name and password into the login form."""
        input_username = self.browser.find_element(By.CSS_SELECTOR, '.ivu-input')
        input_password = self.browser.find_element(By.CSS_SELECTOR, '[placeholder~=请输入密码]')
        input_username.send_keys(self.user_name)
        input_password.send_keys(self.password)

    def get_screenshot(self):
        """Return a screenshot of the browser window as a PIL image."""
        png_bytes = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(png_bytes))

    def get_img_position(self):
        """Return ``(top, bottom, left, right)`` of the captcha canvas.

        Waits for elements with class ``geetest_canvas_img`` and uses the
        first one.
        """
        img = self.wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'geetest_canvas_img'))
        )
        time.sleep(2)  # pause like a human would before reacting
        location = img[0].location
        size = img[0].size
        top = location.get('y')
        left = location.get('x')
        bottom = top + size.get('height')
        right = left + size.get('width')
        return top, bottom, left, right

    def get_geetest_image(self):
        """Return the captcha area cropped out of a fresh screenshot."""
        top, bottom, left, right = self.get_img_position()
        screenshot = self.get_screenshot()
        return screenshot.crop((left, top, right, bottom))

    def get_slider(self):
        """Wait for the slider handle to be clickable and return it."""
        return self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))
        )

    @staticmethod
    def _pixels_close(pixel1, pixel2, threshold=60):
        """Return True when the first three channels each differ by less than threshold."""
        return all(abs(pixel1[c] - pixel2[c]) < threshold for c in range(3))

    def is_pixel_equal(self, image1, image2, x, y):
        """Return True when both images have (almost) the same colour at (x, y).

        Two pixels count as equal when each of their first three channels
        differs by less than 60.
        """
        return self._pixels_close(image1.load()[x, y], image2.load()[x, y])

    def get_gap(self, image1, image2):
        """Return the x coordinate of the first column where the images differ.

        Scanning starts at column 60; 60 is also returned when no differing
        pixel is found.  Only the area covered by both images is scanned.
        """
        left = 60
        # Load pixel access objects once instead of once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        width = min(image1.size[0], image2.size[0])
        height = min(image1.size[1], image2.size[1])
        for i in range(left, width):
            for j in range(height):
                if not self._pixels_close(pixels1[i, j], pixels2[i, j]):
                    return i
        return left

    def get_track(self, distance):
        """Split ``distance`` into per-step offsets that speed up, then slow down.

        The motion accelerates until 4/5 of the distance is covered and
        decelerates afterwards.  Steps are integers whose sum equals
        ``round(distance)``.

        Args:
            distance: Total horizontal distance to travel, in pixels.

        Returns:
            List of integer offsets, empty when ``distance`` is not positive.
        """
        track = []
        mid = distance * 4 / 5  # switch from accelerating to decelerating here
        t = 0.2                 # duration of one step
        v = 0                   # current speed
        current = 0             # exact distance covered so far
        moved = 0               # whole pixels already handed out
        while current < distance:
            a = 2 if current < mid else -3
            v0 = v
            v = v0 + a * t
            # Never travel past the target.
            current = min(current + v0 * t + 0.5 * a * t * t, distance)
            # Emit the difference of rounded positions so rounding errors
            # do not accumulate over the whole track.
            step = round(current) - moved
            moved += step
            track.append(step)
        return track

    def move_to_gap(self, slider, tracks):
        """Drag ``slider`` horizontally by each offset in ``tracks``, then release."""
        ActionChains(self.browser).click_and_hold(slider).perform()
        try:
            for x in tracks:
                ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
            time.sleep(0.5)
        finally:
            # Always let go of the mouse button, even if a move failed.
            ActionChains(self.browser).release().perform()

    def sort_random_geetest(self):
        """Solve the slide puzzle that follows the click verification."""
        # Both captures are crops of the same element, so their sizes and
        # pixel coordinates line up for the comparison in get_gap.
        # NOTE(review): assumes clicking the slider is what makes the gap
        # visible on the canvas - confirm against the live page.
        image1 = self.get_geetest_image()
        slider = self.get_slider()
        slider.click()
        image2 = self.get_geetest_image()
        gap = self.get_gap(image1=image1, image2=image2)
        track = self.get_track(gap)
        self.move_to_gap(slider, track)

    def crack_geek(self):
        """Run the whole login flow."""
        # Step 1: enter user name and password.
        self.browser.get(self.url)
        self.sort_username_password()
        # Step 2: pass the click verification.
        button = self.get_button()
        button.click()
        # Step 3: pass the follow-up slide verification.
        self.sort_random_geetest()
# Script entry point: run the demo with placeholder credentials.
if __name__ == '__main__':
    gs = Geetest_sample('test', 'test')
    gs.crack_geek()
2.4 点触验证码
- 这类验证码需要借助第三方API解决。
- 这里使用了超级鹰验证码识别平台作为解决方案。
- 需要注意调整验证码截图的尺寸。
>>>from selenium import webdriver
>>>from selenium.webdriver import ActionChains
>>>from selenium.webdriver.support.wait import WebDriverWait
>>>from selenium.webdriver.support import expected_conditions as EC
>>>from selenium.webdriver.common.by import By
>>>from PIL import Image
>>>from io import BytesIO
>>>from hashlib import md5
>>>from selenium.common.exceptions import *
>>>import time
>>>import requests
# Chaojiying account settings; replace the placeholders with real values.
CHAOJIYING_USERNAME = "yourusername"
CHAOJIYING_PASSWORD = "yourpassword"
CHAOJIYING_SOFT_ID = "yoursoftid"
# Coordinate captcha with four points; response format: x1,y1|x2,y2|x3,y3|x4,y4
CHAOJIYING_TYPE = "9004"
class Chaojiying(object):
    """Small client for the Chaojiying captcha-recognition HTTP service."""

    def __init__(self, username, password, soft_id, timeout=30):
        """Prepare the parameters sent with every request.

        Args:
            username: Chaojiying account name.
            password: Account password in plain text; only its MD5 hex
                digest is stored and sent.
            soft_id: Software id registered with the service.
            timeout: Seconds to wait for the service before giving up.
        """
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON answer.

        Args:
            im: Image content as bytes.
            codetype: Captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        # NOTE(review): the request goes over plain HTTP.
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON answer.

        Args:
            im_id: Id of the image whose answer was wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()
class Geetest_sample():
    """Log in to the Geetest console and solve the follow-up captcha.

    After the click verification either a slide puzzle or a click-the-image
    captcha may appear; the latter is solved through the Chaojiying service.
    Creating an instance starts a Firefox session through Selenium.
    """

    def __init__(self, user_name, password, url='https://auth.geetest.com/login'):
        """Remember the credentials, start the browser, create the Chaojiying client.

        Args:
            user_name: Text typed into the user-name field.
            password: Text typed into the password field.
            url: Address of the login page.
        """
        self.url = url
        self.user_name = user_name
        self.password = password
        self.browser = webdriver.Firefox()
        # Explicit waits on this browser give up after 10 seconds.
        self.wait = WebDriverWait(self.browser, 10)
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)

    def get_button(self, cls):
        """Wait for the element with class ``cls`` to be clickable and return it."""
        return self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, cls)))

    def sort_username_password(self):
        """Type the stored user name and password into the login form."""
        input_username = self.browser.find_element(By.CSS_SELECTOR, '.ivu-input')
        input_password = self.browser.find_element(By.CSS_SELECTOR, '[placeholder~=请输入密码]')
        input_username.send_keys(self.user_name)
        input_password.send_keys(self.password)

    def get_screenshot(self):
        """Return a screenshot of the browser window as a PIL image."""
        png_bytes = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(png_bytes))

    def get_img_position(self, cls):
        """Return ``(top, bottom, left, right)`` of the first element with class ``cls``."""
        img = self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, cls)))
        time.sleep(2)  # pause like a human would before reacting
        location = img[0].location
        size = img[0].size
        top = location.get('y')
        left = location.get('x')
        bottom = top + size.get('height')
        right = left + size.get('width')
        return top, bottom, left, right

    def get_geetest_image(self, cls):
        """Return the element with class ``cls`` cropped out of a fresh screenshot."""
        top, bottom, left, right = self.get_img_position(cls)
        screenshot = self.get_screenshot()
        return screenshot.crop((left, top, right, bottom))

    @staticmethod
    def _pixels_close(pixel1, pixel2, threshold=60):
        """Return True when the first three channels each differ by less than threshold."""
        return all(abs(pixel1[c] - pixel2[c]) < threshold for c in range(3))

    def is_pixel_equal(self, image1, image2, x, y):
        """Return True when both images have (almost) the same colour at (x, y).

        Two pixels count as equal when each of their first three channels
        differs by less than 60.
        """
        return self._pixels_close(image1.load()[x, y], image2.load()[x, y])

    def get_gap(self, image1, image2):
        """Return the x coordinate of the first column where the images differ.

        Scanning starts at column 60; 60 is also returned when no differing
        pixel is found.  Only the area covered by both images is scanned.
        """
        left = 60
        # Load pixel access objects once instead of once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        width = min(image1.size[0], image2.size[0])
        height = min(image1.size[1], image2.size[1])
        for i in range(left, width):
            for j in range(height):
                if not self._pixels_close(pixels1[i, j], pixels2[i, j]):
                    return i
        return left

    def get_track(self, distance):
        """Split ``distance`` into per-step offsets that speed up, then slow down.

        The motion accelerates until 4/5 of the distance is covered and
        decelerates afterwards.  Steps are integers whose sum equals
        ``round(distance)``.

        Args:
            distance: Total horizontal distance to travel, in pixels.

        Returns:
            List of integer offsets, empty when ``distance`` is not positive.
        """
        track = []
        mid = distance * 4 / 5  # switch from accelerating to decelerating here
        t = 0.2                 # duration of one step
        v = 0                   # current speed
        current = 0             # exact distance covered so far
        moved = 0               # whole pixels already handed out
        while current < distance:
            a = 2 if current < mid else -3
            v0 = v
            v = v0 + a * t
            # Never travel past the target.
            current = min(current + v0 * t + 0.5 * a * t * t, distance)
            # Emit the difference of rounded positions so rounding errors
            # do not accumulate over the whole track.
            step = round(current) - moved
            moved += step
            track.append(step)
        return track

    def move_to_gap(self, slider, tracks):
        """Drag ``slider`` horizontally by each offset in ``tracks``, then release."""
        ActionChains(self.browser).click_and_hold(slider).perform()
        try:
            for x in tracks:
                ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
            time.sleep(0.5)
        finally:
            # Always let go of the mouse button, even if a move failed.
            ActionChains(self.browser).release().perform()

    def get_points(self, result):
        """Parse the ``pic_str`` field of a recognition result into coordinates.

        Args:
            result: Mapping with a ``pic_str`` entry of the form
                ``"x1,y1|x2,y2|..."``.

        Returns:
            List of ``[x, y]`` integer pairs; empty when ``pic_str`` is
            missing or empty.
        """
        pic_str = result.get('pic_str')
        if not pic_str:
            return []
        groups = pic_str.split('|')
        return [[int(number) for number in group.split(',')] for group in groups]

    def click_locations(self, locations):
        """Click each coordinate on the captcha widget, then the confirm button."""
        element = self.browser.find_element(By.CLASS_NAME, "geetest_widget")
        for location in locations:
            # NOTE(review): depending on the Selenium version the offset is
            # measured from the element's top-left corner or from its centre;
            # verify against the installed version.
            action = ActionChains(self.browser).move_to_element_with_offset(
                element, location[0], location[1]
            ).click()
            action.perform()
            time.sleep(1)
        button = self.get_button('geetest_commit_tip')
        button.click()

    def resize_img(self, image):
        """Return a resampled copy of ``image`` with the same dimensions."""
        (x, y) = image.size
        # LANCZOS is the filter formerly exposed as ANTIALIAS.
        return image.resize((x, y), Image.LANCZOS)

    def sort_by_chaojiying(self, image):
        """Send ``image`` as PNG bytes to Chaojiying and return its answer."""
        image = self.resize_img(image)
        bytes_array = BytesIO()
        image.save(bytes_array, format="PNG")
        return self.chaojiying.PostPic(bytes_array.getvalue(), CHAOJIYING_TYPE)

    def sort_random_geetest(self):
        """Solve whichever captcha follows the click verification.

        A slide puzzle is tried first; if its canvas never appears, the
        click-the-image captcha is tried.  Nothing happens when neither shows up.
        """
        try:
            image1 = self.get_geetest_image('geetest_canvas_img')
        except TimeoutException:
            image1 = None
        if image1 is not None:
            # Both captures are crops of the same element, so their sizes
            # and pixel coordinates line up for the comparison in get_gap.
            # NOTE(review): assumes clicking the slider is what makes the
            # gap visible on the canvas - confirm against the live page.
            slider = self.get_button('geetest_slider_button')
            slider.click()
            image2 = self.get_geetest_image('geetest_canvas_img')
            gap = self.get_gap(image1=image1, image2=image2)
            track = self.get_track(gap)
            self.move_to_gap(slider, track)
            return
        try:
            # Only used to detect that the click-the-image captcha is shown.
            self.get_img_position("geetest_tip_img")
        except TimeoutException:
            return
        captcha_img = self.get_geetest_image("geetest_widget")
        result = self.sort_by_chaojiying(image=captcha_img)
        locations = self.get_points(result)
        if locations:
            self.click_locations(locations)

    def crack_geek(self):
        """Run the whole login flow."""
        # Step 1: enter user name and password.
        self.browser.get(self.url)
        self.sort_username_password()
        # Step 2: pass the click verification.
        button = self.get_button('geetest_radar_tip_content')
        button.click()
        # Step 3: pass the follow-up verification.
        self.sort_random_geetest()
# Script entry point: run the demo with placeholder credentials.
if __name__ == '__main__':
    gs = Geetest_sample('test', 'test')
    gs.crack_geek()
参考资料
- https://blog.csdn.net/u010138758/article/details/80152151 J-Ombudsman
- https://www.cnblogs.com/zhuluqing/p/8832205.html moisiet
- https://www.runoob.com 菜鸟教程
- http://www.tulingxueyuan.com/ 北京图灵学院
- http://www.imooc.com/article/19184?block_id=tuijian_wz#child_5_1 两点水
- https://blog.csdn.net/weixin_44213550/article/details/91346411 python老菜鸟
- https://realpython.com/python-string-formatting/ Dan Bader
- https://www.liaoxuefeng.com/ 廖雪峰
- https://blog.csdn.net/Gnewocean/article/details/85319590 新海说
- https://www.cnblogs.com/Nicholas0707/p/9021672.html Nicholas
- https://www.cnblogs.com/dalaoban/p/9331113.html 超天大圣
- https://blog.csdn.net/zhubao124/article/details/81662775 zhubao124
- https://blog.csdn.net/z59d8m6e40/article/details/72871485 z59d8m6e40
- https://www.jianshu.com/p/2b04f5eb5785 MR_ChanHwang
- 《Python学习手册》Mark Lutz
- 《Python编程 从入门到实践》Eric Matthes
- 《Python3网络爬虫开发实战》崔庆才
本文作者:大师兄(superkmi)