为了不在某个平台开通会员、充钱,我借鉴了 GitHub 上某位大神的代码(https://github.com/python-fan/pdf2word ),成功实现了 pdf 转换为 word,各位可以参考借鉴一下。
1. 使用该代码前,需要先安装对应的第三方库:pdf 转换 word(即提取 pdf 中的文字并写入 word)需要两个库:pdfminer3k 和 python-docx;如果还要提取 pdf 中的图片,则需要再安装 pymupdf(代码中以 fitz 为名导入)。
2.实现代码如下:
import os
import re
import time
from concurrent.futures import ProcessPoolExecutor
from configparser import ConfigParser
from io import StringIO
from io import open

import fitz
from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
#提取图片函数
def pdf2pic(path, pic_path):
    """Extract the embedded images of a PDF and save each one as a PNG file.

    Every object in the PDF's cross-reference table is inspected; objects
    whose definition matches both ``/Type /XObject`` and ``/Subtype /Image``
    are turned into a pixmap and written into ``pic_path``. File names are
    derived from the PDF path plus a running image number.

    Args:
        path: Path of the PDF file to read.
        pic_path: Directory that receives the PNG files.
    """
    # Lookaheads let any number of spaces sit between the key and its value.
    xobject_pattern = re.compile(r"/Type(?= */XObject)")
    image_pattern = re.compile(r"/Subtype(?= */Image)")

    # Flatten the PDF path into a plain file-name prefix. Both separator
    # styles are replaced: a leftover "/" would make os.path.join() point
    # outside pic_path (or into a directory that does not exist).
    name_prefix = path.replace('\\', '_').replace('/', '_').replace(':', '')

    image_count = 0
    doc = fitz.open(path)
    try:
        # PyMuPDF renamed its camelCase methods to snake_case; prefer the new
        # names and fall back to the old ones on older installations.
        xref_length = getattr(doc, "xref_length", None) or doc.xrefLength
        xref_object = getattr(doc, "xref_object", None) or doc.xrefObject

        xref_count = xref_length()
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), xref_count - 1))

        # xref numbers start at 1.
        for xref in range(1, xref_count):
            definition = xref_object(xref)
            is_image = (xobject_pattern.search(definition)
                        and image_pattern.search(definition))
            if not is_image:
                continue

            image_count += 1
            target = os.path.join(
                pic_path, name_prefix + "_img{}.png".format(image_count))

            pix = fitz.Pixmap(doc, xref)
            # PNG cannot store CMYK: convert when there are four or more
            # colour components, not counting an alpha channel.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            save_png = getattr(pix, "save", None) or pix.writePNG
            save_png(target)
            # Drop the pixmap right away so its buffer can be freed.
            pix = None
    finally:
        doc.close()

    print("提取了{}张图片".format(image_count))
#读取pdf文件函数
def read_from_pdf(file_path):
    """Read a PDF file and return the text pdfminer extracts from it.

    Args:
        file_path: Path of the PDF file.

    Returns:
        Everything the ``TextConverter`` wrote to its output buffer, as one
        string.
    """
    resource_manager = PDFResourceManager()
    with open(file_path, 'rb') as pdf_file, StringIO() as text_buffer:
        device = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, pdf_file)
        finally:
            # Close the converter even when parsing fails, and always before
            # the buffer is read.
            device.close()
        return text_buffer.getvalue()
#保存word文件函数
def save_text_to_word(content, file_path):
    """Write text into a new Word document, one paragraph per input line.

    Args:
        content: Text to store; it is split on ``'\\n'``.
        file_path: Destination path of the ``.docx`` file.
    """
    doc = Document()
    for line in content.split('\n'):
        paragraph = doc.add_paragraph()
        # Control characters are stripped first — presumably because they
        # cannot be stored in the document's XML; confirm against python-docx.
        paragraph.add_run(remove_control_characters(line))
    doc.save(file_path)
#移除字符
# Translation table mapping code points 0-31 to None (i.e. "delete").
# Built once at import time instead of on every call.
_CONTROL_CHAR_TABLE = dict.fromkeys(range(32))


def remove_control_characters(content):
    """Return ``content`` with every character below U+0020 removed.

    Tab, carriage return and line feed are in that range and are removed as
    well; the space character (U+0020) and everything above it are kept.

    Args:
        content: The string to clean.

    Returns:
        A new string without the characters U+0000 to U+001F.
    """
    return content.translate(_CONTROL_CHAR_TABLE)
#直接调用read_from_pdf(pdf_file_path)函数和save_text_to_word(content, word_file_path)函数
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document containing its text.

    Args:
        pdf_file_path: Path of the PDF file to read.
        word_file_path: Path of the ``.docx`` file to write.
    """
    content = read_from_pdf(pdf_file_path)
    save_text_to_word(content, word_file_path)
#写个主函数,可以读取配置文件信息和使用多进程处理pdf文件
def main():
    """Convert every PDF in the configured folder to Word, in parallel.

    Reads the ``default`` section of ``config.cfg`` in the current directory
    and uses its ``pdf_folder``, ``word_folder`` and ``max_worker`` values.
    Each ``.pdf`` file (extension compared case-insensitively) is handed to
    a worker process that runs ``pdf_to_word``.

    Raises:
        KeyError: If the ``default`` section or one of its keys is missing.
        SystemExit: Always, once all conversions have finished; the exit
            status is 0 when every file converted and 1 otherwise.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']
    pdf_folder = config['pdf_folder']
    word_folder = config['word_folder']

    # (file name, future) pairs, so that failures can be reported per file.
    submitted = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(pdf_folder):
            file_name, extension_name = os.path.splitext(file)
            if extension_name.lower() != '.pdf':
                continue
            pdf_file = os.path.join(pdf_folder, file)
            word_file = os.path.join(word_folder, file_name + '.docx')
            print('正在处理: ', file)
            task = executor.submit(pdf_to_word, pdf_file, word_file)
            submitted.append((file, task))

    # Leaving the ``with`` block waits for every submitted task, so no
    # polling loop is needed; only the outcomes are left to inspect.
    failed = 0
    for file, task in submitted:
        error = task.exception()
        if error is not None:
            failed += 1
            print('处理失败: ', file, error)

    print('完成')
    # SystemExit is raised directly: the ``exit`` helper is only installed
    # by the ``site`` module and is not guaranteed to exist.
    raise SystemExit(1 if failed else 0)
3. 执行代码运行:
if __name__ == '__main__':
    # Sample run: extract the images of ./pdf/test.pdf into ./word.
    # os.path.join keeps the paths valid on every platform, not only where
    # "\\" is the separator.
    fpath = os.getcwd()
    sample_pdf = os.path.join(fpath, 'pdf', 'test.pdf')
    # To convert the text to Word instead, call:
    #   pdf_to_word(sample_pdf, os.path.join(fpath, 'word', 'test.docx'))
    pdf2pic(sample_pdf, os.path.join(fpath, 'word'))