1、背景
这两年我在一家税务公司工作,业务主要与发票相关,其中涉及本地发票文件的OCR识别以及不同格式文件的内容提取。我负责OFD本地文件提取的实现:借助相应的依赖解析文件之后,本质上就是对xml数据进行提取。以下是OFD文件的说明:
OFD(Open Fixed-layout Document的简称,意为开放版式文档)是版面呈现效果高度精确固定的电子文件格式,其呈现与设备无关。它与PDF文件相仿,具有格式独立、版面固定、固化呈现等特点。可以说OFD是中国版的PDF,并且在很多方面的性能优于PDF的同类文档。OFD也逐渐开始在电子发票、电子公文、电子证照等领域中应用。
2、示例
定义一个发票提取器InvExtractor
/**
* Extractor contract for invoices whose content is stored as XML.
*
* @author wenx
*/
public interface InvExtractor {
/**
* Parses the given file and returns the root element of the parsed document.
*
* @param file the invoice file to parse
* @return the root element of the parsed XML document
*/
Element extract(File file);
/**
* Converts a parsed document into a map of invoice information.
* NOTE(review): "covert" in the method name is presumably a typo for "convert";
* the name is part of the interface, so it is left unchanged.
*
* @param root the root element of the parsed document (the value returned by {@link #extract(File)})
* @param file the invoice file the document was parsed from
* @return the invoice information map; the original note describes it as a LinkedHashMap
*/
Map<String, Object> covertInvMap(Element root, File file);
}
实现提取器
/**
 * Extractor for fully-digitalized ("全电") electronic invoices stored as OFD archives.
 *
 * <p>The OFD file is opened as a zip archive and three of its entries are read: the page
 * content (TextObject values), the custom tags (business field name to TextObject ID) and
 * the page template (scanned for the invoice title).
 *
 * @author wenx
 */
@Slf4j
public class OfdElecExtractor implements InvExtractor {

    /** Zip entry with the page content; its TextObjects carry the field values. */
    private static final String PAGE_CONTENT_ENTRY = "Doc_0/Pages/Page_0/Content.xml";

    /** Zip entry with the custom tags; each tag references TextObject IDs through ObjectRef children. */
    private static final String CUSTOM_TAG_ENTRY = "Doc_0/Tags/CustomTag.xml";

    /** Zip entry with the page template; scanned for the invoice title. */
    private static final String TEMPLATE_CONTENT_ENTRY = "Doc_0/Tpls/Tpl_0/Content.xml";

    /**
     * Invoice title text mapped to the invoice type code.
     * The titles use full-width (Chinese) parentheses and are compared verbatim.
     */
    private static final Map<String, String> TYPE_BY_TITLE = new HashMap<>();

    static {
        TYPE_BY_TITLE.put("电子发票(增值税专用发票)", "FullEleSpecInvoice");
        TYPE_BY_TITLE.put("电子发票(普通发票)", "FullEleGenerInvoice");
    }

    /**
     * Reads the page content entry of the OFD archive and returns its XML root element.
     *
     * @param file the OFD file
     * @return root element of {@code Doc_0/Pages/Page_0/Content.xml}
     * @throws BizException if the archive does not contain that entry
     */
    @Override
    public Element extract(File file) {
        return parseRoot(readEntry(file, PAGE_CONTENT_ENTRY));
    }

    /**
     * Builds the invoice information map from the page content and the custom tags.
     *
     * <p>Header fields whose tag (or referenced TextObject) is missing are left out of the map
     * instead of failing with a NullPointerException. Numeric fields are only written when the
     * referenced text is not blank.
     *
     * @param root root element of the page content, as returned by {@link #extract(File)}
     * @param file the OFD file; re-read for the custom tags and for the invoice title
     * @return insertion-ordered map with the header fields, {@code details} and {@code invoiceType}
     * @throws BizException if mandatory tags are missing, a detail reference is missing,
     *                      or the invoice title cannot be resolved
     */
    @Override
    public Map<String, Object> covertInvMap(Element root, File file) {
        // business field name -> TextObject ID (or list of IDs for detail columns)
        Map<String, Object> keyMap = renderKeyMap(file);
        log.info("keyMap :{}", keyMap);
        // TextObject ID -> TextObject, indexed once instead of rescanning the list per field
        Map<String, Element> textObjects = indexTextObjectsById(root);

        // header information
        LinkedHashMap<String, Object> inv = new LinkedHashMap<>();
        putText(inv, "eInvoiceNo", textObjects, keyMap.get("InvoiceNo"));
        putText(inv, "invoiceTime", textObjects, keyMap.get("IssueDate"));
        putText(inv, "buyerName", textObjects, keyMap.get("BuyerName"));
        putText(inv, "buyerTaxNo", textObjects, keyMap.get("BuyerTaxID"));
        putText(inv, "sellerName", textObjects, keyMap.get("SellerName"));
        putText(inv, "sellerTaxNo", textObjects, keyMap.get("SellerTaxID"));
        putText(inv, "payer", textObjects, keyMap.get("InvoiceClerk"));
        putText(inv, "remark", textObjects, keyMap.get("Note"));
        // amounts
        putDecimal(inv, "amount", textObjects, keyMap.get("TaxExclusiveTotalAmount"));
        putDecimal(inv, "taxAmount", textObjects, keyMap.get("TaxTotalAmount"));
        putDecimal(inv, "sumAmount", textObjects, keyMap.get("TaxInclusiveTotalAmount"));
        // detail lines
        inv.put("details", buildDetails(keyMap, textObjects));
        // invoice title / type
        this.extractTitle(inv, file);
        return inv;
    }

    /** Builds one ordered map per detail line; the number of lines is the number of {@code Item} references. */
    private static List<LinkedHashMap<String, Object>> buildDetails(Map<String, Object> keyMap,
                                                                    Map<String, Element> textObjects) {
        List<String> items = refList(keyMap, "Item");
        List<String> specifications = refList(keyMap, "Specification");
        List<String> measurementDimensions = refList(keyMap, "MeasurementDimension");
        List<String> amounts = refList(keyMap, "Amount");
        List<String> prices = refList(keyMap, "Price");
        List<String> taxAmounts = refList(keyMap, "TaxAmount");
        List<String> quantities = refList(keyMap, "Quantity");
        List<String> taxSchemes = refList(keyMap, "TaxScheme");

        List<LinkedHashMap<String, Object>> details = new ArrayList<>();
        for (int i = 0; i < items.size(); i++) {
            LinkedHashMap<String, Object> detail = new LinkedHashMap<>();
            putText(detail, "goodName", textObjects, detailRef(items, i));
            putText(detail, "general", textObjects, detailRef(specifications, i));
            putText(detail, "specifications", textObjects, detailRef(measurementDimensions, i));
            putDecimal(detail, "amount", textObjects, detailRef(amounts, i));
            putDecimal(detail, "priceIncludeTax", textObjects, detailRef(prices, i));
            putDecimal(detail, "taxAmount", textObjects, detailRef(taxAmounts, i));
            putDecimal(detail, "goodNum", textObjects, detailRef(quantities, i));
            putTaxRate(detail, textObjects, detailRef(taxSchemes, i));
            details.add(detail);
        }
        return details;
    }

    /** Returns the reference list stored under {@code key}, or an empty list when the tag was absent. */
    @SuppressWarnings("unchecked")
    private static List<String> refList(Map<String, Object> keyMap, String key) {
        Object refs = keyMap.get(key);
        if (refs == null) {
            return new ArrayList<>();
        }
        // safe: renderKeyMap only stores List<String> under the detail keys
        return (List<String>) refs;
    }

    /**
     * Returns the reference of a detail column for the given line, or {@code null} when the
     * whole column is absent. A column that exists but has no usable reference for this line
     * is reported as a business error rather than an IndexOutOfBoundsException.
     */
    private static String detailRef(List<String> refs, int index) {
        if (refs.isEmpty()) {
            return null;
        }
        if (index >= refs.size() || StringUtils.isBlank(refs.get(index))) {
            throw new BizException("商品验错误");
        }
        return refs.get(index);
    }

    /** Writes the TextCode text of the referenced TextObject; does nothing if the reference cannot be resolved. */
    private static void putText(Map<String, Object> target, String field,
                                Map<String, Element> textObjects, Object ref) {
        if (ref == null) {
            return;
        }
        Element textObject = textObjects.get(ref);
        if (textObject != null) {
            target.put(field, textObject.elementTextTrim("TextCode"));
        }
    }

    /** Writes the referenced text as a BigDecimal; skipped when the reference is unresolved or the text is blank. */
    private static void putDecimal(Map<String, Object> target, String field,
                                   Map<String, Element> textObjects, Object ref) {
        String text = resolveText(textObjects, ref);
        if (StringUtils.isNotBlank(text)) {
            target.put(field, new BigDecimal(text));
        }
    }

    /** Writes {@code taxRate} as a fraction: the percent sign is removed and the value divided by 100 (scale 8). */
    private static void putTaxRate(Map<String, Object> target, Map<String, Element> textObjects, Object ref) {
        String text = resolveText(textObjects, ref);
        if (StringUtils.isNotBlank(text)) {
            // NOTE(review): a non-numeric rate text still raises NumberFormatException, as before — confirm whether such values occur
            target.put("taxRate", new BigDecimal(text.replace("%", ""))
                    .divide(new BigDecimal(100), 8, RoundingMode.HALF_UP));
        }
    }

    /** Returns the trimmed TextCode text of the referenced TextObject, or {@code null} if it cannot be resolved. */
    private static String resolveText(Map<String, Element> textObjects, Object ref) {
        if (ref == null) {
            return null;
        }
        Element textObject = textObjects.get(ref);
        return textObject == null ? null : textObject.elementTextTrim("TextCode");
    }

    /** Indexes all TextObjects by their ID attribute; TextObjects without an ID are ignored, the first ID wins. */
    private static Map<String, Element> indexTextObjectsById(Element root) {
        Map<String, Element> byId = new HashMap<>();
        for (Element textObject : collectTextObjects(root)) {
            String id = textObject.attributeValue("ID");
            if (id != null) {
                byId.putIfAbsent(id, textObject);
            }
        }
        return byId;
    }

    /** Collects the TextObject children of every Layer under the Content element, in document order. */
    private static List<Element> collectTextObjects(Element root) {
        Element content = root.element("Content");
        if (content == null) {
            throw new BizException("ofd 页面内容解析错误");
        }
        List<Element> textObjects = new ArrayList<>();
        List<Element> layers = content.elements("Layer");
        for (Element layer : layers) {
            List<Element> layerTexts = layer.elements("TextObject");
            if (!CollectionUtils.isEmpty(layerTexts)) {
                textObjects.addAll(layerTexts);
            }
        }
        return textObjects;
    }

    /**
     * Reads the custom tag entry and maps each business field name to the TextObject ID(s)
     * it references. Single-valued tags are stored as a String, detail columns as a List of Strings.
     *
     * @throws BizException if an amount tag or the invoice clerk tag is missing or malformed
     */
    private Map<String, Object> renderKeyMap(File file) {
        // work around the parse failure caused by the empty namespace declaration on the root tag
        String body = readEntry(file, CUSTOM_TAG_ENTRY)
                .replace("<:eInvoice xmlns:=\"\">",
                        "<:eInvoice xmlns:ofd=\"http://www.ofdspec.org/2016\" DocType=\"OFD\" Version=\"1.1\">");
        Element root = parseRoot(body);

        Map<String, Object> keyMap = new HashMap<>();
        putRef(keyMap, root, "Note");
        putRef(keyMap, root, "IssueDate");
        putRef(keyMap, root, "InvoiceNo");
        // buyer
        Element buyer = root.element("Buyer");
        putRef(keyMap, buyer, "BuyerName");
        putRef(keyMap, buyer, "BuyerTaxID");
        // seller
        Element seller = root.element("Seller");
        putRef(keyMap, seller, "SellerName");
        putRef(keyMap, seller, "SellerTaxID");
        // amounts
        putAmountRef(keyMap, root, "TaxExclusiveTotalAmount", "ofd 未税金额解析错误");
        putAmountRef(keyMap, root, "TaxTotalAmount", "ofd 税额解析错误");
        putAmountRef(keyMap, root, "TaxInclusiveTotalAmount", "ofd 含税金额解析错误");
        // invoice clerk (mandatory)
        List<Element> invoiceClerks = root.elements("InvoiceClerk");
        if (CollectionUtils.isEmpty(invoiceClerks)) {
            throw new BizException("ofd 开票人解析错误");
        }
        keyMap.put("InvoiceClerk", invoiceClerks.get(0).elementTextTrim("ObjectRef"));
        // detail columns
        putRefList(keyMap, root, "Item");
        putRefList(keyMap, root, "Specification");
        putRefList(keyMap, root, "TaxScheme");
        putRefList(keyMap, root, "MeasurementDimension");
        putRefList(keyMap, root, "Amount");
        putRefList(keyMap, root, "TaxAmount");
        putRefList(keyMap, root, "Price");
        putRefList(keyMap, root, "Quantity");
        return keyMap;
    }

    /** Stores the ObjectRef text of the optional child tag {@code name}; a missing parent or tag is skipped. */
    private static void putRef(Map<String, Object> keyMap, Element parent, String name) {
        if (parent == null) {
            return;
        }
        Element tag = parent.element(name);
        if (tag != null) {
            keyMap.put(name, tag.elementTextTrim("ObjectRef"));
        }
    }

    /**
     * Stores the reference of a mandatory amount tag. The tag is expected to carry two ObjectRef
     * children: index 0 references the currency sign, index 1 references the amount itself.
     */
    private static void putAmountRef(Map<String, Object> keyMap, Element root, String name, String errorMessage) {
        Element tag = root.element(name);
        if (tag == null) {
            throw new BizException(errorMessage);
        }
        List<Element> refs = tag.elements("ObjectRef");
        if (refs.size() < 2) {
            throw new BizException(errorMessage);
        }
        // trimmed like every other reference so that it matches the TextObject ID attribute
        keyMap.put(name, refs.get(1).getTextTrim());
    }

    /** Stores the ObjectRef texts of all tags called {@code name} as a list; nothing is stored if there are none. */
    private static void putRefList(Map<String, Object> keyMap, Element root, String name) {
        List<Element> tags = root.elements(name);
        if (!CollectionUtils.isEmpty(tags)) {
            keyMap.put(name, tags.stream()
                    .map(tag -> tag.elementTextTrim("ObjectRef"))
                    .collect(Collectors.toList()));
        }
    }

    /**
     * Resolves the invoice type from the title on the page template and stores the type code
     * (a String) under {@code invoiceType}.
     *
     * <p>It is not known which TextObject holds the title, so every text containing "发票" is
     * tried until one matches a known title.
     *
     * @throws BizException if no text contains "发票", or none of the candidates is a known title
     */
    private void extractTitle(Map<String, Object> inv, File file) {
        Element root = parseRoot(readEntry(file, TEMPLATE_CONTENT_ENTRY));
        boolean candidateFound = false;
        String invType = null;
        for (Element textObject : collectTextObjects(root)) {
            String text = textObject.elementTextTrim("TextCode");
            if (StringUtils.isBlank(text) || !text.contains("发票")) {
                continue;
            }
            candidateFound = true;
            invType = TYPE_BY_TITLE.get(text);
            if (invType != null) {
                break;
            }
        }
        if (!candidateFound) {
            throw new BizException("发票抬头 提取错误");
        }
        if (StringUtils.isBlank(invType)) {
            throw new BizException("发票抬头 转换错误");
        }
        inv.put("invoiceType", invType);
    }

    /** Reads one zip entry of the OFD archive as UTF-8 text; a missing entry is reported as a business error. */
    @SneakyThrows
    private static String readEntry(File file, String entryName) {
        try (ZipFile zipFile = new ZipFile(file)) {
            ZipEntry entry = zipFile.getEntry(entryName);
            if (entry == null) {
                throw new BizException("ofd 文件缺少条目:" + entryName);
            }
            try (InputStream input = zipFile.getInputStream(entry)) {
                return StreamUtils.copyToString(input, StandardCharsets.UTF_8);
            }
        }
    }

    /** Parses XML text and returns its root element. */
    @SneakyThrows
    private static Element parseRoot(String xml) {
        Document document = DocumentHelper.parseText(xml);
        return document.getRootElement();
    }
}
定义提取器处理chain
public interface ExtractorHandlerChain {
/**
* Extracts the invoice information from the given file.
*
* @param file the invoice file to process
* @return the extracted invoice
*/
Invoice process(File file);
}
实现chain提取器
/**
 * Default chain that turns a received invoice file into an {@code Invoice}:
 * detect the file flavour, build the matching extractor and map handler, extract, then map.
 *
 * @author wenx
 */
@Component
public class ExtractorHandlerChainDefault implements ExtractorHandlerChain {

    /**
     * Runs the whole chain for one file.
     *
     * @param file the invoice file
     * @return the converted invoice
     * @throws BizException if any stage yields an empty result
     */
    @Override
    public Invoice process(@NotNull File file) {
        final String flavour = before(file);
        final InvExtractor invExtractor = requirePresent(createExtractor(flavour), "文件提取器获取失败");
        final InvExtractorMapHandler invMapHandler = requirePresent(createMapHandler(flavour), "文件提取映射获取失败");

        final Element rootElement = invExtractor.extract(file);
        final Map<String, Object> rawInvoice =
                requirePresent(invExtractor.covertInvMap(rootElement, file), "准发票数据获取失败");

        return requirePresent(invMapHandler.handleInvMap(rawInvoice), "发票信息转换获取失败");
    }

    /**
     * Decides which extractor flavour applies by probing the archive's entries.
     *
     * @param file the invoice file, opened as a zip archive
     * @return {@code ELECTRON_OFD} when the tax-control attachment is absent and the page content
     *         entry is present, otherwise {@code TAX_ODF}
     */
    @SneakyThrows
    public String before(@NotNull File file) {
        try (ZipFile archive = new ZipFile(file)) {
            // entry probed by the original "tax-control check"
            final ZipEntry taxControlAttachment = archive.getEntry("Doc_0/Attachs/original_invoice.xml");
            final ZipEntry pageContent = archive.getEntry("Doc_0/Pages/Page_0/Content.xml");
            // original note: fully-digitalized invoices change the suffix while the content stays the same
            final boolean electronic =
                    !ObjectUtil.isNotEmpty(taxControlAttachment) && ObjectUtil.isNotEmpty(pageContent);
            if (electronic) {
                return ELECTRON_OFD;
            }
        }
        return TAX_ODF;
    }

    /** Returns {@code value} unchanged, or throws a BizException with {@code message} when it is empty. */
    private static <T> T requirePresent(T value, String message) {
        if (ObjectUtil.isEmpty(value)) {
            throw new BizException(message);
        }
        return value;
    }
}
提取器静态工厂
/**
 * Static factory that builds the extractor and the map handler registered for a file flavour.
 *
 * @author wenx
 */
public class ConstructExtractorFactory {

    /**
     * Registration table: row key = flavour name, column key = extractor class name,
     * value = map handler class name.
     */
    private static final Table<String, String, String> extractorMapTable = HashBasedTable.create();

    static {
        extractorMapTable.put(ELECTRON_OFD,
                "com.extractor.OfdElecExtractor",
                "com.extractor.handler.OfdElecInvExtractorMapHandler");
    }

    /**
     * Builds the extractor registered for the given flavour.
     *
     * @param name flavour name; matched against the registered row keys
     * @return a new extractor instance
     * @throws BizException if no extractor class is registered for {@code name} (including a null name)
     */
    public static InvExtractor createExtractor(String name) {
        String rowKey = findRowKey(name);
        String className = rowKey == null
                ? null
                : extractorMapTable.row(rowKey).keySet().iterator().next();
        if (CharSequenceUtil.isBlank(className)) {
            throw new BizException("获取提取器clz失败");
        }
        return (InvExtractor) instantiate(className);
    }

    /**
     * Builds the map handler registered for the given flavour.
     *
     * @param name flavour name; matched against the registered row keys
     * @return a new map handler instance
     * @throws BizException if no handler class is registered for {@code name} (including a null name)
     */
    public static InvExtractorMapHandler createMapHandler(String name) {
        String rowKey = findRowKey(name);
        String className = rowKey == null
                ? null
                : extractorMapTable.row(rowKey).values().iterator().next();
        if (CharSequenceUtil.isBlank(className)) {
            throw new BizException("获取转换器clz失败");
        }
        return (InvExtractorMapHandler) instantiate(className);
    }

    /** Returns the first non-empty row whose key contains {@code name}, or {@code null} if there is none. */
    private static String findRowKey(String name) {
        if (name == null) {
            // String.contains(null) would throw; a null name simply matches nothing
            return null;
        }
        for (String rowKey : extractorMapTable.rowKeySet()) {
            // NOTE(review): substring match kept for backward compatibility — confirm whether an exact match is intended
            if (rowKey.contains(name) && !extractorMapTable.row(rowKey).isEmpty()) {
                return rowKey;
            }
        }
        return null;
    }

    /**
     * Instantiates {@code className} through its no-argument constructor.
     * Replaces the deprecated {@code Class.newInstance()}; an exception thrown by the constructor
     * is unwrapped so that callers still see the original exception.
     */
    @SneakyThrows
    private static Object instantiate(String className) {
        try {
            return Class.forName(className).getDeclaredConstructor().newInstance();
        } catch (java.lang.reflect.InvocationTargetException e) {
            throw e.getCause() != null ? e.getCause() : e;
        }
    }
}
测试代码
/**
 * Runs the handler chain against {@code test.ofd} and expects a non-null invoice.
 */
@Test
public void testHandlerChain() {
    final Invoice extracted = handlerChain.process(new File("test.ofd"));
    log.info("invoice:{}", extracted);
    Assertions.assertNotNull(extracted);
}
-end-