在图像验证码识别中,以前经常用到 tesseract;但它的识别率较低,故现在已不常用。
pip install pytesseract  # 安装第三方库(tesseract 的 Python 封装,另需单独安装 Tesseract-OCR 引擎)
现在识别普通图像验证码,主要是用 pillow 图像处理库截取验证码图片,再配合第三方打码平台"超级鹰"进行识别。
pip install pillow
1. 考生之家验证码识别
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import quote
from lxml import etree
from PIL import Image
from io import BytesIO
from chaojiying import main1
import time
# Module-level browser session shared by every function below.
chrome_options = webdriver.ChromeOptions()
# Uncomment to run headless: no browser window is opened, but the script
# still behaves the same.
# chrome_options.add_argument('--headless')
# The ``chrome_options`` keyword is deprecated (since Selenium 3.8) and is no
# longer accepted by recent Selenium 4 releases; ``options`` is the
# supported spelling.
browser = webdriver.Chrome(options=chrome_options)
browser.set_window_size(1400, 700)
# Explicit wait: block for up to 10 seconds until a specific node is ready.
wait = WebDriverWait(browser, 10)
def get_page():
    """Open the registration page and return its current HTML source."""
    browser.get('http://bm.e21cn.com/log/reg.aspx')
    return browser.page_source
def get_big_image():
    """Scroll the page to y=300 and return a screenshot of the browser window.

    Returns:
        The screenshot decoded into a PIL ``Image``.
    """
    # Scroll first so the form inputs and the captcha are inside the window.
    browser.execute_script('window.scrollTo(0, 300)')
    png_bytes = browser.get_screenshot_as_png()
    return Image.open(BytesIO(png_bytes))
def get_position():
    """Return the captcha image's box as ``(x1, y1, x2, y2)``.

    The box (top-left and bottom-right corners) is expressed relative to the
    visible window, so it can be passed straight to ``Image.crop`` on a
    screenshot taken at the current scroll position. Call this *after* the
    page has been scrolled for the screenshot.

    Returns:
        A 4-tuple ``(x1, y1, x2, y2)``.

    Raises:
        selenium.common.exceptions.TimeoutException: if ``#imgCheckCode``
            does not appear within the wait timeout.
    """
    img = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#imgCheckCode')))
    loc = img.location
    size = img.size
    print(loc)
    print(size)
    # ``location`` is measured from the top of the document, while the
    # screenshot only covers the window. Subtract the scroll offset the
    # browser actually applied instead of a hard-coded 300: on a page too
    # short to scroll that far the browser clamps the scroll and a fixed
    # 300 would shift the crop box off the captcha.
    scroll_y = browser.execute_script('return window.pageYOffset') or 0
    x1 = loc['x']
    y1 = loc['y'] - scroll_y
    x2 = x1 + size['width']
    y2 = y1 + size['height']
    return (x1, y1, x2, y2)
def parse_html(html):
    """Recognise the captcha, fill in the registration form and submit it.

    Saves the window screenshot to ``full_screen.png`` and the cropped
    captcha to ``crop.png``, passes the crop to ``main1`` (imported from the
    ``chaojiying`` module; presumably returns the recognised text -- confirm
    against that module), then types the form values and clicks submit.

    Args:
        html: Page source from ``get_page``. Currently unused.
    """
    page_shot = get_big_image()
    page_shot.save('full_screen.png')

    # Cut the captcha out of the full screenshot.
    file_name = 'crop.png'
    page_shot.crop(get_position()).save(file_name)

    captcha_text = main1(file_name)
    print(captcha_text)

    # (CSS selector, text to type), in the order the fields are filled.
    form_values = (
        ('input#username', '剑圣'),
        ('input#pwd', '123456'),
        ('input#pwd_Q', '123456'),
        ('input#tel', '18362537333'),
        ('input#CheckCode', captcha_text),
    )

    # Locate every input (and the submit button) before typing anything.
    fields = [
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        for selector, _ in form_values
    ]
    submit_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'input#btn_login'))
    )

    for field, (_, value) in zip(fields, form_values):
        field.send_keys(value)

    # Short pause before submitting.
    time.sleep(2)
    submit_button.click()
def main():
    """Load the registration page, then solve the captcha and submit the form."""
    parse_html(get_page())


if __name__ == '__main__':
    main()
# Headless variant of the browser setup.
chrome_options = webdriver.ChromeOptions()
# Headless mode: no browser window is opened, but everything still runs
# normally.
chrome_options.add_argument('--headless')
# ``chrome_options=`` is deprecated (since Selenium 3.8) and no longer
# accepted by recent Selenium 4 releases; pass ``options=`` instead.
browser = webdriver.Chrome(options=chrome_options)
browser.execute_script('window.scrollTo(0, 300)')  # 要操作的输入框及验证码框在屏幕上不能全部显示,故将页面向下滚动 300 像素。
screenshot = browser.get_screenshot_as_png()
screenshot = Image.open(BytesIO(screenshot))  # 对页面进行截图。
y1 = loc['y'] - 300  # 定位验证码时,由于页面已向下滚动了 300 像素,需要将其减掉,才能得到验证码在截图中的准确位置。
2. 移动登录网页验证码获取
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from io import BytesIO
from chaojiying import main1
import time
# Module-level browser session shared by every function below.
chrome_options = webdriver.ChromeOptions()
# ``chrome_options=`` is deprecated (since Selenium 3.8) and no longer
# accepted by recent Selenium 4 releases; pass ``options=`` instead.
browser = webdriver.Chrome(options=chrome_options)
# Set the window size and the explicit-wait timeout (3 seconds).
browser.set_window_size(1366, 768)
wait = WebDriverWait(browser, 3)
def get_page():
    """Open the registration page and return its current HTML source."""
    browser.get('https://login.10086.cn/html/register/register.html')
    return browser.page_source
def get_big_image():
    """Return a screenshot of the browser window as a PIL ``Image``.

    The page is not scrolled before the screenshot is taken.
    """
    png_bytes = browser.get_screenshot_as_png()
    return Image.open(BytesIO(png_bytes))
def get_position():
    """Return the captcha image's box as ``(x1, y1, x2, y2)``.

    The box is the top-left and bottom-right corners of ``#captchaImg``.
    No scroll offset is subtracted, because this script takes its screenshot
    without scrolling the page.

    Raises:
        selenium.common.exceptions.TimeoutException: if ``#captchaImg`` does
            not appear within the wait timeout.
    """
    captcha = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#captchaImg'))
    )
    origin = captcha.location
    extent = captcha.size
    print(origin)
    print(extent)
    left, top = origin['x'], origin['y']
    return (left, top, left + extent['width'], top + extent['height'])
def parse_html(html):
    """Recognise the captcha, fill in the registration form and submit it.

    Saves the window screenshot to ``full_screen.png`` and the cropped
    captcha to ``crop.png``, passes the crop to ``main1`` (imported from the
    ``chaojiying`` module; presumably returns the recognised text -- confirm
    against that module), then types the form values and clicks submit.

    Args:
        html: Page source from ``get_page``. Currently unused.
    """
    # Take the window screenshot and keep a copy on disk.
    page_shot = get_big_image()
    page_shot.save('full_screen.png')

    # Cut the captcha out of the screenshot and save the small image.
    file_name = 'crop.png'
    page_shot.crop(get_position()).save(file_name)

    # Hand the small image to the Chaojiying helper.
    captcha_text = main1(file_name)
    print(captcha_text)

    # (CSS selector, text to type), in the order the fields are filled.
    form_values = (
        ('input#loginName', '1382@cdn.com'),
        ('input#newPassword', '123456'),
        ('input#newPasswordRepeat', '123456'),
        ('input#inputCode', captcha_text),
    )

    # Locate every input (and the submit button) before typing anything.
    fields = [
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
        for selector, _ in form_values
    ]
    submit_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'input#regSub'))
    )

    for field, (_, value) in zip(fields, form_values):
        field.send_keys(value)

    # Short pause before submitting.
    time.sleep(2)
    submit_button.click()
def main():
    """Load the registration page, then solve the captcha and submit the form."""
    parse_html(get_page())


if __name__ == '__main__':
    main()
screenshot = get_big_image()  # 获取屏幕截图
screenshot.save('full_screen.png')  # 保存屏幕截图
# 获取验证码坐标
x1, y1, x2, y2 = get_position()
crop_image = screenshot.crop((x1, y1, x2, y2))  # 截取验证码小图
file_name = 'crop.png'
crop_image.save(file_name)  # 保存小图
captha_str = main1(file_name)  # 传入小图并调用超级鹰解析验证码