转载: https://www.zhangshengrong.com/p/yOXDZwz3aB/
demo :
import pytesseract
from PIL import Image
# Open the captcha inside a context manager so the underlying file handle is
# released as soon as OCR has finished, instead of staying open until exit.
with Image.open("captcha.png") as image:
    # No `lang` is given, so Tesseract's default language is used.
    print(pytesseract.image_to_string(image))
=================================================
=================================================中文识别
import pytesseract
from PIL import Image
# Open the picture inside a context manager so the underlying file handle is
# released as soon as OCR has finished, instead of staying open until exit.
with Image.open("00.jpg") as image:
    # lang='chi_sim' asks Tesseract to recognise Simplified Chinese.
    print(pytesseract.image_to_string(image, lang='chi_sim'))
有时候文本识别率并不高，建议在识别之前先对图像进行灰度化和二值化处理：
import pytesseract
from PIL import Image
file = r"00.jpg"

# Grey levels below this cut-off become black and everything else becomes
# white. Tune it for the input image (valid range is 0-255).
threshold = 180

# Lookup table mapping each of the 256 grey levels to 0 (black) or 1 (white).
table = [0 if i < threshold else 1 for i in range(256)]

# Convert to greyscale first, then binarise through the lookup table.
# The context manager closes the source file once the greyscale copy exists.
with Image.open(file) as image:
    Img = image.convert('L')
photo = Img.point(table, '1')

# Save the processed picture so it can be inspected afterwards.
photo.save('01.jpg')

# Run OCR on the saved picture. lang='chi_sim' selects Simplified Chinese
# (the default is English). To recognise digits only, additionally pass
# config='--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789'.
with Image.open('01.jpg') as image:
    content = pytesseract.image_to_string(image, lang='chi_sim')
print(content)
-
实战案例–实现古诗文网验证码自动识别登录
import pytesseract
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Login page of the gushiwen site; the captcha image lives on this page.
LOGIN_URL = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'


def save_captcha(path, url=LOGIN_URL):
    """Open the login page in Chrome and save a screenshot of its captcha.

    Args:
        path: File path the captcha screenshot is written to.
        url: Page to open. Defaults to ``LOGIN_URL``.

    Returns:
        The live Chrome driver, still on the login page, so the caller can
        fill in the form. The caller is responsible for calling ``quit()``.

    Raises:
        Whatever the driver raises while loading the page or locating the
        captcha element; the browser is closed before the error propagates.
    """
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.implicitly_wait(10)
        driver.get(url=url)
        # find_element(By.ID, ...) replaces find_element_by_id, which was
        # removed in Selenium 4.
        driver.find_element(By.ID, 'imgCode').screenshot(path)
    except Exception:
        # Do not leave an orphaned browser behind when setup fails.
        driver.quit()
        raise
    return driver


def recognize_captcha(captcha_path):
    """Run OCR on a captcha image and return the recognised characters.

    The image is converted to greyscale and binarised (levels below 140
    become 0, all others 255) before being handed to Tesseract.

    Args:
        captcha_path: Path of the captcha image file.

    Returns:
        The recognised text with all whitespace removed.
    """
    with Image.open(captcha_path) as captcha:
        grey = captcha.convert('L')
    binary = grey.point(lambda level: 0 if level < 140 else 255)
    text = pytesseract.image_to_string(binary)
    # Tesseract's output ends with a newline (and sometimes a form feed).
    # Typed into the form through send_keys, that newline would act as Enter,
    # so all whitespace is removed here.
    return ''.join(text.split())


def login(driver, code, email='1242931802@qq.com', password='xxxx'):
    """Fill in the login form and click the login button.

    Args:
        driver: Driver positioned on the login page.
        code: Captcha text to type into the verification field.
        email: Account e-mail registered on the site.
        password: Account password.

    Returns:
        True when the form was filled in and the button clicked without a
        WebDriver error, False otherwise. The page's response is not
        inspected, so True does not prove the site accepted the login.
    """
    try:
        driver.find_element(By.ID, 'email').send_keys(email)
        driver.find_element(By.ID, 'pwd').send_keys(password)
        driver.find_element(By.ID, 'code').send_keys(code)
        driver.implicitly_wait(10)
        driver.find_element(By.ID, 'denglu').click()
    except WebDriverException as ex:
        # Report the failure instead of swallowing it silently.
        print('登录失败:', ex)
        return False
    return True


if __name__ == '__main__':
    captcha_path = './captcha.png'
    driver = save_captcha(captcha_path)
    try:
        code = recognize_captcha(captcha_path)
        print('识别验证码为:', code)
        login(driver, code)
    finally:
        # Always close the browser, including when OCR or login fails.
        driver.quit()
效果如下（有时第一次可能识别失败，可以写个循环逻辑让它多识别几次；一般程序运行 1～3 次基本就能识别成功）：