依赖
<!-- tess4j 解析图片 -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.5.4</version>
</dependency>
<!-- 引入 PDFBox 相关的依赖 解析pdf -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>xmpbox</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>preflight</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>1.8.17</version>
</dependency>
下载训练数据资源
从 https://github.com/tesseract-ocr/tessdata 下载训练数据(tessdata)压缩包并解压
将加载目录(datapath)设置到解压后直接包含 *.traineddata 文件的那一级目录
springboot 配置读取训练资源
- application.yml
# Path to the Tesseract training-data (tessdata) folder
tess4j:
  datapath: C:/Users/98473/Downloads/tessdata-main/tessdata-main
- 加载配置
import net.sourceforge.tess4j.Tesseract;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
 * Spring configuration that exposes a pre-configured Tess4J {@link Tesseract} instance as a bean.
 *
 * @author Jenson
 * @version 1.0
 */
@Configuration
public class TesseractOcrConfig {

    /** Training-data folder, injected from the {@code tess4j.datapath} property. */
    @Value("${tess4j.datapath}")
    private String dataPath;

    /**
     * Creates the OCR engine, pointing it at the configured training-data folder and
     * enabling English plus Simplified Chinese recognition.
     *
     * @return the configured {@link Tesseract} instance
     */
    @Bean
    public Tesseract tesseract() {
        final Tesseract ocrEngine = new Tesseract();
        // "+" combines several languages: English and Simplified Chinese.
        ocrEngine.setLanguage("eng+chi_sim");
        // Location of the training-data folder.
        ocrEngine.setDatapath(dataPath);
        return ocrEngine;
    }
}
解析纯图pdf为文本
/**
 * Extracts the text of a PDF file, falling back to OCR when the PDF has no text layer.
 *
 * <p>The embedded text is read first with {@code PDFTextStripper}. If that yields no
 * visible text, every image XObject listed directly in each page's resources is run
 * through Tesseract and the recognized text is concatenated in page order.
 *
 * @param filePath path of the PDF file to parse
 * @return the extracted text; empty if neither a text layer nor any page image produced text
 * @throws IOException if the file cannot be loaded or an embedded image cannot be decoded
 * @throws TesseractException if OCR fails on an image
 */
String analysisPdf(String filePath) throws IOException, TesseractException {
    // try-with-resources: the document must be closed, otherwise its underlying
    // file/scratch resources leak on every call (including when an exception is thrown).
    try (PDDocument doc = PDDocument.load(new File(filePath))) {
        String text = new PDFTextStripper().getText(doc);
        if (StringUtils.hasText(text)) {
            return text;
        }

        // No text layer found: probably an image-only (scanned) PDF.
        log.info("----> 作为纯图pdf解析");
        StringBuilder ocrText = new StringBuilder();
        for (PDPage page : doc.getPages()) {
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without a resource dictionary has no images to recognize.
                continue;
            }
            for (COSName xObjectName : resources.getXObjectNames()) {
                if (!resources.isImageXObject(xObjectName)) {
                    continue;
                }
                PDImageXObject imageXObject = (PDImageXObject) resources.getXObject(xObjectName);
                // getImage() already returns a BufferedImage, so it is handed to the OCR
                // engine directly instead of being re-encoded to PNG and decoded again.
                BufferedImage pageImage = imageXObject.getImage();
                ocrText.append(tesseract.doOCR(pageImage));
            }
        }
        return ocrText.toString();
    }
}
参考
https://blog.csdn.net/benben_521ben/article/details/127332675
https://blog.csdn.net/mxt51220/article/details/133809953