Eino 中的 PDFParser 实现位于扩展包 eino-ext 项目中, 目前处于 alpha 阶段, 还不能较好地支持所有的 PDF 使用场景:
// PDFParser reads from an io.Reader and parses its content as plain text.
// Attention: this is in alpha stage and may not support all PDF use cases well enough.
// For example, it does not preserve whitespace and new lines for now.
type PDFParser struct {
// ToPages presumably switches the output to one document per page instead of
// a single document; its use is not shown here — TODO confirm against eino-ext.
ToPages bool
}
用户在处理 PDF 文档时, 可以基于第三方 PDF 解析库自定义 PDFParser 实现。目前商业化的第三方库有 unipdf 等; 本文将基于一个免费的第三方库 github.com/ledongthuc/pdf 实现自定义 PDFParser:
-
准备一个测试的PDF, 包含多页:
image.png
- PDFParser 实现, 可通过 ByPages 属性设置是将全文转换成一个 Document, 还是按页转换为多个 Document:
package main
import (
"bytes"
"context"
"fmt"
"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino/schema"
"github.com/ledongthuc/pdf"
"io"
"os"
)
// PdfParser is a custom PDF parser built on github.com/ledongthuc/pdf.
type PdfParser struct {
// ByPages selects the output granularity of Parse: when true, Parse emits one
// Document per page; when false, the whole file becomes a single Document.
ByPages bool
}
// Parse reads the whole PDF from reader and extracts its plain text.
//
// When pp.ByPages is true, one Document is produced for each page that has a
// Contents entry (pages without one are skipped). Otherwise the text of the
// entire file is returned as a single Document. The ExtraMeta resolved from
// opts is assigned, without copying, as the MetaData of every Document.
//
// An error is returned if reader cannot be drained, the data cannot be opened
// as a PDF, or text extraction fails. No documents are returned together with
// a non-nil error.
func (pp *PdfParser) Parse(ctx context.Context, reader io.Reader, opts ...parser.Option) (docs []*schema.Document, err error) {
	commonOpts := parser.GetCommonOptions(nil, opts...)

	// pdf.NewReader needs an io.ReaderAt and the total size, so the stream is
	// buffered in memory first.
	data, err := io.ReadAll(reader)
	if err != nil {
		return nil, fmt.Errorf("pdf parser read all from reader failed: %w", err)
	}

	r, err := pdf.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		// Without this check an invalid PDF would leave r nil and the calls
		// below would dereference it.
		return nil, fmt.Errorf("pdf parser open pdf failed: %w", err)
	}

	if pp.ByPages {
		totalPage := r.NumPage()
		// Pages are addressed starting from 1.
		for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
			p := r.Page(pageIndex)
			if p.V.IsNull() || p.V.Key("Contents").Kind() == pdf.Null {
				continue
			}

			result, err := p.GetPlainText(nil)
			if err != nil {
				return nil, fmt.Errorf("pdf parser get plain text of page %d failed: %w", pageIndex, err)
			}

			docs = append(docs, &schema.Document{
				Content:  result,
				MetaData: commonOpts.ExtraMeta,
			})
		}
		return docs, nil
	}

	rr, err := r.GetPlainText()
	if err != nil {
		return nil, fmt.Errorf("pdf parser get plain text failed: %w", err)
	}

	pdfData, err := io.ReadAll(rr)
	if err != nil {
		return nil, fmt.Errorf("pdf parser read plain text failed: %w", err)
	}

	docs = append(docs, &schema.Document{
		Content:  string(pdfData),
		MetaData: commonOpts.ExtraMeta,
	})
	return docs, nil
}
// main parses the demo PDF page by page, then prints the number of Documents
// followed by the content and metadata of each one. It panics if the file
// cannot be opened or parsed.
func main() {
	pp := &PdfParser{ByPages: true}

	f, err := os.Open("./app/eino/pdf_parser/demo.pdf")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	docs, err := pp.Parse(context.Background(), f, parser.WithExtraMeta(map[string]any{"test": "test"}))
	if err != nil {
		// Surface parse failures instead of silently printing zero documents.
		panic(err)
	}

	fmt.Println(len(docs))
	for _, v := range docs {
		fmt.Println(v.Content)
		fmt.Println(v.MetaData)
	}
}
-
测试输出结果:
image.png

