python实现PDF文档间对比（百度文本识别接口）

一、原理
1、将PDF文档每页转换为图像
2、调用百度通用文本识别页面接口，对图像进行内容识别
3、对图像内容进行对比，并将对比不一致的内容在文档图像上进行标记（红框）
4、将对比结果以表格形式输出为HTML文件，以便查看差异

二、范围和限制
1、目前仅支持PDF文档之间的对比
2、无法识别图形（盖章和logo）、不清晰字迹
3、需要联网使用（OCR使用的是百度通用文本识别接口，仅限测试使用，暂不限次数）
4、对比存在误差（原因为百度OCR识别无法达到100%准确）

三、安装库
pip install pymupdf
pip install requests
pip install pillow

四、参数
originPDF: PDF文档原件路径
contrastPDF: PDF文档扫描件路径
resultRoot: 输出结果路径（提示：程序运行时会先清空该目录，请不要设置为桌面等存有其他文件的目录）
输出 : 标注差异的文档图像、Html文档

五、源码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = '孙思锴'

import os
import shutil
import fitz
import difflib
from datetime import datetime
import base64
from PIL import Image
from PIL import ImageDraw
import requests
from concurrent.futures import ThreadPoolExecutor

# URL of the Baidu text-recognition demo endpoint every OCR request is posted to.
url = 'https://ai.baidu.com/aidemo'

# HTTP headers sent with each OCR request (a desktop Chrome user agent).
headers = {}
headers["user-agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"

# One HTTP session shared by all OCR requests.
session = requests.Session()

# Text of the word blocks that did not match, stored per page under the page
# number passed to imageDiff: one dict for the original document, one for the
# scanned document.
originDic = dict()
contrastDic = dict()


def initRoot(rootPath):
    """
    Reset a directory so that it exists and is empty.

    If something already exists at the location it is removed with
    ``shutil.rmtree`` (including everything inside it) before the directory
    is created again.

    :param rootPath: directory to reset, relative or absolute
    :return: absolute path of the freshly created directory
    """
    absolute_root = os.path.abspath(rootPath)
    already_present = os.path.exists(absolute_root)
    if already_present:
        # Wipe the previous contents so stale results never mix with new ones.
        shutil.rmtree(absolute_root)
    os.makedirs(absolute_root)
    return absolute_root


def conver_img(pdfFilepath, outputPath):
    """
    Render every page of a PDF document to a PNG image.

    The images are written to ``<outputPath>/<PDF file name without
    extension>/`` and are named ``0.png``, ``1.png``, ... after the
    zero-based page index. An existing directory of that name is deleted
    first.

    :param pdfFilepath: path of the PDF document
    :param outputPath: directory under which the image directory is created
    :return: tuple ``(page count, image directory path)``
    :raises SystemExit: with status 1 when ``pdfFilepath`` does not exist
    """
    pdfFilepath = os.path.abspath(pdfFilepath)
    if not os.path.exists(pdfFilepath):
        print('文件不存在：', pdfFilepath)
        # A missing input is a failure: exit with a non-zero status, and do
        # not rely on the interactive-only ``exit`` helper.
        raise SystemExit(1)

    # The image directory is named after the PDF file (extension stripped).
    pdfNamePath, _extension = os.path.splitext(os.path.basename(pdfFilepath))
    ImagePath = os.path.join(outputPath, pdfNamePath)
    if os.path.exists(ImagePath):
        shutil.rmtree(ImagePath)  # drop images left over from an earlier run
    os.makedirs(ImagePath)

    # Scaling both axes by 2 yields four times as many pixels per page.
    # The matrix is the same for every page, so build it once.
    trans = fitz.Matrix(2.0, 2.0)

    doc = fitz.open(pdfFilepath)
    try:
        # PyMuPDF renamed its camelCase API to snake_case; prefer the current
        # names and fall back to the legacy ones on old installations.
        page_count = doc.page_count if hasattr(doc, 'page_count') else doc.pageCount
        for page_index in range(page_count):
            page = doc[page_index]
            render = getattr(page, 'get_pixmap', None) or page.getPixmap
            pm = render(matrix=trans, alpha=False)
            save = getattr(pm, 'save', None) or pm.writePNG
            save(os.path.join(ImagePath, str(page_index) + '.png'))
    finally:
        # Release the file handle even if rendering fails part-way.
        doc.close()
    return page_count, ImagePath


def getImageInfo(filename):
    """
    Recognise the text in an image through the Baidu OCR demo endpoint
    (the "accurate" general text recognition, which includes positions).

    Steps:
    1. read the image and encode it as a base64 PNG data URI
    2. post it to the endpoint, retrying while the service answers
       errno 102 (requests too frequent)
    3. return the decoded JSON response

    :param filename: path of the image file
    :return: decoded JSON response as a dict. Known shapes:
        errno 102 - demo requested too frequently, ``data`` is ``''``
        errno 106 - wrong file type, ``data`` is ``''``
        errno 0   - success; ``data['words_result']`` is a list of
                    ``{'location': {'width', 'top', 'left', 'height'}, 'words': str}``
        If every attempt is rate limited, the last errno 102 response is
        returned, so callers always receive a dict.
    """
    import time  # local import: only the retry back-off needs it

    with open(filename, 'rb') as f:
        base64image = 'data:image/png;base64,' + base64.b64encode(f.read()).decode()
    dic = {
        "image": base64image,
        "image_url": "",
        "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate",
        "detect_direction": "false",
    }

    max_attempts = 5
    result = None
    for attempt in range(max_attempts):
        # A timeout keeps a stalled connection from blocking the worker forever.
        result = session.post(url=url, headers=headers, data=dic, timeout=60).json()
        if result['errno'] != 102:
            break
        if attempt < max_attempts - 1:
            # Rate limited: wait a little longer before each new attempt,
            # since an immediate retry is likely to be rejected as well.
            time.sleep(attempt + 1)
    return result


def _ocr_words(filename, page):
    """Return the OCR word blocks of one page image, or raise if recognition failed."""
    result = getImageInfo(filename=filename)
    try:
        return result['data']['words_result']
    except (TypeError, KeyError) as err:
        # A failed request carries no word list; report which page and file
        # failed instead of letting an opaque subscript error escape.
        raise RuntimeError(
            'OCR failed for page {} ({}): {!r}'.format(page, filename, result)
        ) from err


def _drop_matching_words(origin_words_list, contrast_words_list, offset):
    """Remove, in place, the word blocks that both documents agree on."""
    for origin_words in origin_words_list[:]:
        top = origin_words['location']['top']
        for contrast_words in contrast_words_list[:]:
            # Only blocks whose top edges are closer than ``offset`` are
            # treated as being in the same place; the horizontal position
            # is not compared.
            if abs(top - contrast_words['location']['top']) >= offset:
                continue
            if origin_words['words'] == contrast_words['words']:
                # Identical text: the block is confirmed in both documents.
                contrast_words_list.remove(contrast_words)
                origin_words_list.remove(origin_words)
                break
            if origin_words['words'] in contrast_words['words']:
                # The scanned block contains the original text plus more:
                # the original block is accounted for, the surplus text stays
                # in the scanned block so it is still reported.
                origin_words_list.remove(origin_words)
                contrast_words['words'] = contrast_words['words'].replace(origin_words['words'], '', 1)
                break


def _mark_words(image, words_list):
    """Draw a red frame around each word block and return their text, one block per line."""
    draw = ImageDraw.ImageDraw(image)
    for words in words_list:
        left, top = words['location']['left'], words['location']['top']
        right, bottom = left + words['location']['width'], top + words['location']['height']
        draw.rectangle(((left, top), (right, bottom)), outline='red', width=2)
    return ''.join(words['words'] + '\n' for words in words_list)


def imageDiff(resultRoot, originFile, contrastFile, page=1, offset=40):
    """
    Compare one page of the original document with the same page of the scan.

    Both images are sent to OCR. Word blocks found at a similar height with
    matching text are discarded; every remaining block is framed in red on
    its image. The two annotated images are pasted side by side (original on
    the left) and saved to ``resultRoot``. The text of the remaining blocks
    is stored in ``originDic[page]`` and ``contrastDic[page]``.

    :param resultRoot: directory the side-by-side image is written to
    :param originFile: image of the original page
    :param contrastFile: image of the scanned page
    :param page: page number used as dictionary key and in the output file name
    :param offset: maximum difference (exclusive) between the ``top`` values of
        two word blocks for them to be compared
    :return: None
    :raises RuntimeError: when OCR returns no word list for either image
    """
    origin_words_list = _ocr_words(originFile, page)
    contrast_words_list = _ocr_words(contrastFile, page)
    _drop_matching_words(origin_words_list, contrast_words_list, offset)

    with Image.open(originFile) as originImage, Image.open(contrastFile) as contrastImage:
        originDic[page] = _mark_words(originImage, origin_words_list)
        contrastDic[page] = _mark_words(contrastImage, contrast_words_list)

        # Side-by-side canvas: widths add up, the taller page sets the height;
        # any area not covered by a page stays black.
        origin_width, origin_height = originImage.size
        contrast_width, contrast_height = contrastImage.size
        new_Image = Image.new(
            'RGB',
            (origin_width + contrast_width, max(origin_height, contrast_height)),
            "#000000",
        )
        new_Image.paste(originImage, (0, 0))
        new_Image.paste(contrastImage, (origin_width, 0))
        new_Image.save(os.path.join(resultRoot, "第" + str(page) + '页文档.png'))


if __name__ == '__main__':
    # Script entry point: render both PDFs to images, compare them page by
    # page in a thread pool, then write the remaining differences as HTML.
    startTime = datetime.now()
    # Documents to compare and the output location; edit before running.
    # NOTE(review): the first file name ends in "-扫描件" although the variable
    # holds the original document - confirm the two paths are not swapped.
    originPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-扫描件.pdf'  # original document
    contrastPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-改字.pdf'  # scanned document
    resultRoot = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\对比结果'  # output directory

    resultRoot = initRoot(resultRoot)  # empties the output directory
    originImageNum, originImagePath = conver_img(originPDF, resultRoot)
    contrastImageNum, contrastImagePath = conver_img(contrastPDF, resultRoot)
    if originImageNum != contrastImageNum:
        print('文档页数不一致！请查看', resultRoot)
        # A page-count mismatch is a failure, so exit with a non-zero status.
        raise SystemExit(1)
    resultRoot = os.path.join(resultRoot, '对比结果')  # sub-directory for the comparison results
    os.makedirs(resultRoot)

    try:
        # Leaving the ``with`` block waits for every submitted page to finish.
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    imageDiff,
                    resultRoot,
                    os.path.join(originImagePath, str(i) + '.png'),
                    os.path.join(contrastImagePath, str(i) + '.png'),
                    i + 1,
                )
                for i in range(originImageNum)
            ]
        for future in futures:
            # result() re-raises an exception from the worker; without it a
            # failed page would only show up later as a missing dictionary key.
            future.result()

        # Write the per-page differences to one HTML file.
        # NOTE: make_file() returns a complete HTML document, so the file
        # holds one document per page, written back to back.
        diff = difflib.HtmlDiff()
        with open(os.path.join(resultRoot, '结果.html'), 'w', encoding="utf-8") as f:
            for page in range(1, originImageNum + 1):
                f.write(diff.make_file(fromlines=originDic[page].splitlines(),
                                       tolines=contrastDic[page].splitlines(),
                                       fromdesc='原件第' + str(page) + '页',
                                       todesc='扫描件第' + str(page) + '页'))
    finally:
        session.close()  # release the HTTP session even when a step failed

    endTime = datetime.now()
    print('文档共', originImageNum, '页，执行总时间：', endTime - startTime)
    print('执行成功，请查看输出目录：', resultRoot)

六、执行结果示例：

标注差异的对比照片

Html文档表格

python实现PDF文档间对比（百度文本识别接口）

python实现PDF文档间对比（百度文本识别接口）

推荐阅读更多精彩内容

友情链接更多精彩内容