Abbyy 图片(PDF) 转word 实践
1. 简介
Abbyy 是一个俄罗斯软件,官网:https://www.abbyy.com (中文: https://www.abbyy.com )。作为OCR软件,它既拥有自己的客户端,同时也提供SDK供开发使用,功能强大。这里是一个简单的OCR图片文字识别Demo。
2. 干货
1. 安装
(1)从http://www.abbyychina.com/xiazai.html 下载ABBYY FineReader。
(2)安装该引擎。
(3)激活序列号:在License Manager中激活你的序列号(该序列号会出现在后文的代码中)。如图:
到此为止准备工作就OK了。
2.实践(单个转换)
执行流程:
(1)自定义一个配置类,用于设置文件转换路径、用户序列号等
/// <summary>
/// Static settings for the OCR demo: the developer serial number, the folder
/// that holds the engine DLLs and the folder that holds input/output files.
/// </summary>
public class FreConfig
{
    // Pointer size, in bytes, of a 64-bit process.
    private const int PointerSize64 = 8;

    // Engine binaries folder returned for a 64-bit process.
    private const string Bin64Folder = "C:\\Program Files\\ABBYY SDK\\11\\FineReader Engine\\Bin64";

    // Value returned for a 32-bit process; it is a placeholder text, not a real path.
    private const string Bin32Folder = "32位dll地址";

    /// <summary>
    /// True when the current process uses 8-byte pointers.
    /// </summary>
    private static bool Is64Bit
    {
        get
        {
            return IntPtr.Size == PointerSize64;
        }
    }

    /// <summary>
    /// Developer serial number handed to the engine loader.
    /// </summary>
    public static string DeveloperSN
    {
        get
        {
            return "xxxxxxxxx";
        }
    }

    /// <summary>
    /// Folder that contains the engine DLLs, chosen by process bitness.
    /// </summary>
    public static string DllFolder
    {
        get
        {
            return Is64Bit ? Bin64Folder : Bin32Folder;
        }
    }

    /// <summary>
    /// Folder where source images are read from and results are written to.
    /// </summary>
    public static string FileFolder
    {
        get
        {
            return @"C:\Users\帝子降兮\Desktop\AbbyyImage";
        }
    }
}
==注意: 1. DeveloperSN 就是之前在License Manager中激活的序列号 2. 这里的实践基于64位机器;如果是32位机器,需要把 DllFolder 中的占位文字换成实际的32位dll目录==
(2) 加载引擎
/// <summary>
/// Loads FREngine.dll from <c>FreConfig.DllFolder</c>, obtains the engine object
/// through the exported <c>GetEngineObject</c> function, and deinitializes and
/// unloads the library again in <see cref="Dispose"/>.
/// </summary>
public class EngineLoader : IDisposable
{
    // Unicode entry point so that an install path with non-ANSI characters still loads;
    // SetLastError lets the constructor report why a load failed.
    [DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
    private static extern IntPtr LoadLibraryEx(string dllToLoad, IntPtr reserved, uint flags);
    private const uint LOAD_WITH_ALTERED_SEARCH_PATH = 0x00000008;

    [DllImport("kernel32.dll")]
    private static extern IntPtr GetProcAddress(IntPtr hModule, string procedureName);

    [DllImport("kernel32.dll")]
    private static extern bool FreeLibrary(IntPtr hModule);

    // Signatures of the three functions resolved from FREngine.dll.
    [UnmanagedFunctionPointer(CallingConvention.StdCall, CharSet = CharSet.Unicode)]
    private delegate int GetEngineObject(string devSN, ref IEngine engine);
    [UnmanagedFunctionPointer(CallingConvention.StdCall)]
    private delegate int DeinitializeEngine();
    [UnmanagedFunctionPointer(CallingConvention.StdCall)]
    private delegate int DllCanUnloadNow();

    private IEngine engine = null;
    private IntPtr dllHandle = IntPtr.Zero;
    private GetEngineObject getEngineObject = null;
    private DeinitializeEngine deinitializeEngine = null;
    private DllCanUnloadNow dllCanUnloadNow = null;

    /// <summary>
    /// The loaded engine object; null after <see cref="Dispose"/> has run.
    /// </summary>
    public IEngine Engine
    {
        get
        {
            return engine;
        }
    }

    /// <summary>
    /// Loads the engine library and obtains the engine object.
    /// </summary>
    /// <param name="developerSN">Developer serial number passed to GetEngineObject.</param>
    /// <exception cref="DllNotFoundException">FREngine.dll could not be loaded.</exception>
    /// <exception cref="EntryPointNotFoundException">A required export is missing from the library.</exception>
    /// <exception cref="COMException">GetEngineObject returned a failure HRESULT.</exception>
    public EngineLoader(string developerSN)
    {
        string enginePath = Path.Combine(FreConfig.DllFolder, "FREngine.dll");
        try
        {
            dllHandle = LoadLibraryEx(enginePath, IntPtr.Zero, LOAD_WITH_ALTERED_SEARCH_PATH);
            if (dllHandle == IntPtr.Zero)
            {
                // Read the Win32 error immediately, before another call can overwrite it.
                int loadError = Marshal.GetLastWin32Error();
                throw new DllNotFoundException(
                    "无法加载" + enginePath,
                    new System.ComponentModel.Win32Exception(loadError));
            }

            getEngineObject = (GetEngineObject)ResolveExport("GetEngineObject", typeof(GetEngineObject));
            deinitializeEngine = (DeinitializeEngine)ResolveExport("DeinitializeEngine", typeof(DeinitializeEngine));
            dllCanUnloadNow = (DllCanUnloadNow)ResolveExport("DllCanUnloadNow", typeof(DllCanUnloadNow));

            // Obtain the engine object.
            int hresult = getEngineObject(developerSN, ref engine);
            Marshal.ThrowExceptionForHR(hresult);
        }
        catch
        {
            // Undo whatever was loaded, then rethrow: swallowing the error here would
            // hand the caller an instance whose Engine property is null.
            engine = null;
            // NOTE(review): presumably these collections let finalizers release any
            // engine wrappers before the library is freed - confirm.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
            if (dllHandle != IntPtr.Zero)
            {
                FreeLibrary(dllHandle);
            }
            ResetNativeState();
            throw;
        }
    }

    /// <summary>
    /// Deinitializes the engine and frees the library when it reports it can be unloaded.
    /// Does nothing if no engine is loaded.
    /// </summary>
    /// <exception cref="COMException">
    /// DeinitializeEngine or DllCanUnloadNow returned a failure HRESULT; thrown only
    /// after the internal state has been reset.
    /// </exception>
    public void Dispose()
    {
        if (engine == null)
        {
            return;
        }
        engine = null;

        // Keep both HRESULTs; reusing one variable would lose a deinitialization failure.
        int deinitializeResult = deinitializeEngine();
        GC.Collect();
        GC.WaitForPendingFinalizers();
        GC.Collect();
        int canUnloadResult = dllCanUnloadNow();
        if (canUnloadResult == 0)
        {
            FreeLibrary(dllHandle);
        }
        ResetNativeState();

        // Throw only after cleanup is complete.
        Marshal.ThrowExceptionForHR(deinitializeResult);
        Marshal.ThrowExceptionForHR(canUnloadResult);
    }

    // Looks up an exported function in the loaded library and wraps it in a delegate.
    private Delegate ResolveExport(string exportName, Type delegateType)
    {
        IntPtr address = GetProcAddress(dllHandle, exportName);
        if (address == IntPtr.Zero)
        {
            throw new EntryPointNotFoundException("无法找到 " + exportName + " 函数");
        }
        return Marshal.GetDelegateForFunctionPointer(address, delegateType);
    }

    // Clears the library handle and the delegates bound to it.
    private void ResetNativeState()
    {
        dllHandle = IntPtr.Zero;
        getEngineObject = null;
        deinitializeEngine = null;
        dllCanUnloadNow = null;
    }
}
(3) 执行逻辑
/// <summary>
/// Recognizes the image 20161121103603.JPG in <c>FreConfig.FileFolder</c> and exports
/// the result as Demo.docx into the same folder.
/// Failures are written to the standard error stream; nothing is thrown to the caller.
/// </summary>
public void Start()
{
    try
    {
        using (EngineLoader loade = new EngineLoader(FreConfig.DeveloperSN))
        {
            var engine = loade.Engine;
            if (engine == null)
            {
                // Fail with a clear message instead of a NullReferenceException below.
                throw new InvalidOperationException("The FineReader engine was not loaded.");
            }

            // Load a predefined profile; other profiles are listed in the help file.
            engine.LoadPredefinedProfile("DocumentConversion_Accuracy");
            string imagePath = Path.Combine(FreConfig.FileFolder, @"20161121103603.JPG");
            // Language of the engine's own messages.
            engine.MessagesLanguage = MessagesLanguageEnum.ML_ChinesePRC;

            FRDocument document = engine.CreateFRDocument();
            try
            {
                // Treat the image as a photo.
                var prepareMode = engine.CreatePrepareImageMode();
                prepareMode.PhotoProcessingMode = PhotoProcessingModeEnum.PPM_TreatAsPhoto;

                // Recognition language: mixed Chinese and English.
                DocumentProcessingParams processingParams = engine.CreateDocumentProcessingParams();
                RecognizerParams recognizerParams = processingParams.PageProcessingParams.RecognizerParams;
                recognizerParams.SetPredefinedTextLanguage("ChinesePRC,English");

                // Add the image, recognize it, then export in the requested format.
                document.AddImageFile(imagePath, prepareMode, null);
                document.Process(processingParams);
                document.Export(Path.Combine(FreConfig.FileFolder, @"Demo.docx"), FileExportFormatEnum.FEF_DOCX, null);
            }
            finally
            {
                // Close the document whether or not processing succeeded; errors
                // continue on to the outer handler instead of being swallowed here.
                document.Close();
            }
        }
    }
    catch (Exception ex)
    {
        // The method keeps its non-throwing contract but no longer hides the failure.
        Console.Error.WriteLine("ABBYY conversion failed: " + ex);
    }
}
直接调用Start() 方法,一个简单的示例就完成了。
注意: 在测试的时候一定要注意SetPredefinedTextLanguage,默认是英文,如果你的图片或pdf里面是中文的话,==会出现乱码==,同时abbyy 支持设置导出后文档的格式、识别图片文字时旋转,背景色控制等等功能,详见帮助文档
3. 实践(批量转换)
执行流程与单个转换的执行流程大致相似,只是我们创建的不是FRDocument 而是BatchProcessor;同时批量执行时需要实现IImageSource接口(以及配套的IFileAdapter接口),完成图片(PDF)文件的遍历。
/// <summary>
/// <c>IFileAdapter</c> implementation that describes one source file by its path.
/// </summary>
public class FileAdapterImpl : IFileAdapter
{
    // Path supplied to the constructor; never changed afterwards.
    private readonly string sourcePath;

    /// <summary>
    /// Creates an adapter for the given file.
    /// </summary>
    /// <param name="fileName">Path of the file this adapter represents.</param>
    public FileAdapterImpl(string fileName)
    {
        sourcePath = fileName;
    }

    /// <summary>
    /// Returns the path that was passed to the constructor.
    /// </summary>
    public string GetFileName()
    {
        return sourcePath;
    }

    /// <summary>
    /// Always returns null.
    /// </summary>
    /// <remarks>
    /// NOTE(review): presumably null means "process every page" - confirm against
    /// the IFileAdapter documentation.
    /// </remarks>
    public IntsCollection GetPagesToProcess()
    {
        return null;
    }

    /// <summary>
    /// Always returns an empty string, i.e. no password is supplied.
    /// </summary>
    public string GetPassword()
    {
        return string.Empty;
    }
}
/// <summary>
/// <c>IImageSource</c> implementation that queues the supported image/PDF files
/// found directly inside one folder and hands them out one at a time.
/// </summary>
public class ImageSourceImpl : IImageSource
{
    // Extensions (without the dot) that are queued. A set with an ordinal,
    // case-insensitive comparer requires an exact match, so ".p" or ".df" is not
    // accepted as a substring of another entry, and matching is culture-independent.
    private static readonly HashSet<string> SupportedExtensions = new HashSet<string>(
        new[]
        {
            "bmp", "dcx", "pcx", "png", "jpg", "jpeg", "jp2", "jpc",
            "jfif", "pdf", "tif", "tiff", "gif", "djvu", "djv", "jb2"
        },
        StringComparer.OrdinalIgnoreCase);

    private bool isEmpty;
    private Queue<string> imagesNames = new Queue<string>();

    /// <summary>
    /// Queues every file in <paramref name="sourceDir"/> whose extension is supported.
    /// </summary>
    /// <param name="sourceDir">Folder to scan; subfolders are not searched.</param>
    public ImageSourceImpl(string sourceDir)
    {
        foreach (string fileName in Directory.GetFiles(sourceDir, "*.*"))
        {
            string extension = Path.GetExtension(fileName);
            // Files without an extension yield an empty string; skip them instead of
            // failing while stripping the leading dot.
            if (string.IsNullOrEmpty(extension))
            {
                continue;
            }
            if (SupportedExtensions.Contains(extension.TrimStart('.')))
            {
                imagesNames.Enqueue(fileName);
            }
        }
        isEmpty = imagesNames.Count == 0;
    }

    /// <summary>
    /// Returns an adapter for the next queued file, or null when the queue is exhausted.
    /// </summary>
    public IFileAdapter GetNextImageFile()
    {
        if (isEmpty)
        {
            return null;
        }
        FileAdapterImpl fileAdapter = new FileAdapterImpl(imagesNames.Dequeue());
        isEmpty = imagesNames.Count == 0;
        return fileAdapter;
    }

    /// <summary>
    /// True when no queued files remain.
    /// </summary>
    public bool IsEmpty()
    {
        return this.isEmpty;
    }
}
执行逻辑:
/// <summary>
/// Recognizes every supported file in <c>sourceFolder</c> with a batch processor and
/// exports each processed page as a .docx file into <c>resultFolder</c>
/// (both names are declared outside this method).
/// Failures are written to the standard error stream; nothing is thrown to the caller.
/// </summary>
public void Start()
{
    try
    {
        using (EngineLoader loade = new EngineLoader(FreConfig.DeveloperSN))
        {
            var engine = loade.Engine;
            if (engine == null)
            {
                // Fail with a clear message instead of a NullReferenceException below.
                throw new InvalidOperationException("The FineReader engine was not loaded.");
            }

            engine.LoadPredefinedProfile("DocumentConversion_Accuracy");

            if (!Directory.Exists(sourceFolder))
            {
                throw new DirectoryNotFoundException(sourceFolder + "不存在");
            }
            if (!Directory.Exists(resultFolder))
            {
                DirectoryInfo newDir = Directory.CreateDirectory(resultFolder);
                if (!newDir.Exists)
                {
                    throw new IOException("无法创建" + resultFolder);
                }
            }

            ImageSourceImpl imageSource = new ImageSourceImpl(sourceFolder);
            if (imageSource.IsEmpty())
            {
                throw new InvalidOperationException("转换文件夹中没有文件");
            }

            // Create the batch processor.
            BatchProcessor batchProcessor = engine.CreateBatchProcessor();

            // Recognition language: mixed Chinese and English.
            PageProcessingParams processingParams = engine.CreatePageProcessingParams();
            processingParams.RecognizerParams.SetPredefinedTextLanguage("ChinesePRC,English");

            // Start processing, then pull pages until the processor returns null.
            batchProcessor.Start(imageSource, null, null, processingParams, null);
            FRPage page = batchProcessor.GetNextProcessedPage();
            while (page != null)
            {
                // Synthesize the page before exporting it.
                page.Synthesize(null);

                // The output file is named after the source file.
                // NOTE(review): if one source file produces several pages they would all
                // map to the same output name and overwrite each other - confirm.
                string resultFilePath = Path.Combine(resultFolder, Path.GetFileName(page.SourceImagePath) + ".docx");
                page.Export(resultFilePath, FileExportFormatEnum.FEF_DOCX, null);

                page = batchProcessor.GetNextProcessedPage();
            }
        }
    }
    catch (Exception ex)
    {
        // The method keeps its non-throwing contract but no longer hides the failure,
        // including the diagnostic exceptions raised above.
        Console.Error.WriteLine("ABBYY batch conversion failed: " + ex);
    }
}
一个简单的demo就完成了,更多高级的功能可以查看帮助文档(其他地方貌似也查不到。。。)
帮助文档路径:C:\Program Files\ABBYY SDK\11\FineReader Engine\Help\FREngine11.chm,我是默认装在C盘的,你可以根据自己的安装路径查找。