Merge multiple PDFs into one PDF
from PyPDF2 import PdfMerger

def merge_pdfs(pdf_files, output_pdf):
    """Merge multiple PDF files into a single PDF file."""
    merger = PdfMerger()
    for pdf in pdf_files:
        try:
            # Open each PDF file and append it to the merger
            with open(pdf, 'rb') as file:
                merger.append(file)
        except FileNotFoundError:
            print(f"Error: File not found at {pdf}")
        except Exception as e:
            print(f"Error processing file {pdf}: {e}")
    try:
        # Write the merged PDF to the output file
        with open(output_pdf, 'wb') as output_file:
            merger.write(output_file)
        print(f"Merged PDF saved to {output_pdf}")
    except Exception as e:
        print(f"Error writing merged PDF: {e}")
    # Close the merger and release its resources
    merger.close()

if __name__ == '__main__':
    # Paths of the PDF files to merge
    pdf_files = ['1.pdf', '2.pdf']  # replace with your own PDF paths
    output_pdf = 'merged_output.pdf'  # name of the merged output file
    # Merge the PDF files
    merge_pdfs(pdf_files, output_pdf)
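PyPDF2 is no longer maintained under that name; development continues as pypdf, where PdfWriter has absorbed the merging functionality. A minimal sketch of the same merge with pypdf (assuming pypdf is installed; the function name and file names are placeholders):

from pypdf import PdfWriter

def merge_pdfs_pypdf(pdf_files, output_pdf):
    """Merge PDFs with pypdf, the successor of PyPDF2."""
    writer = PdfWriter()
    for pdf in pdf_files:
        writer.append(pdf)  # accepts a file path and handles opening/closing itself
    with open(output_pdf, 'wb') as output_file:
        writer.write(output_file)

merge_pdfs_pypdf(['1.pdf', '2.pdf'], 'merged_output.pdf')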
Merge multiple images into one PDF
from PIL import Image

def images_to_pdf(image_files, output_pdf):
    """Merge multiple images into a single PDF file."""
    # Open the first image and save it as a PDF, appending the remaining images
    with Image.open(image_files[0]) as first_image:
        if first_image.mode != 'RGB':
            first_image = first_image.convert('RGB')
        other_images = [Image.open(image).convert('RGB') for image in image_files[1:]]
        first_image.save(output_pdf, save_all=True, append_images=other_images)
    print(f"Merged PDF saved to {output_pdf}")

if __name__ == '__main__':
    # Paths of the image files to merge
    image_files = ['12.jpg', '23.jpg', '31.jpg']  # replace with your own image paths
    output_pdf = 'merged_images.pdf'  # name of the merged output file
    # Merge the images into one PDF
    images_to_pdf(image_files, output_pdf)
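If the source images are plain JPEGs, the img2pdf package can embed them without re-encoding, which preserves quality and usually produces smaller files. A minimal alternative sketch, assuming img2pdf is installed (same placeholder file names as above; note that img2pdf rejects images with an alpha channel):

import img2pdf

image_files = ['12.jpg', '23.jpg', '31.jpg']  # replace with your own image paths
with open('merged_images.pdf', 'wb') as f:
    # convert() takes a list of image paths and returns the finished PDF as bytes
    f.write(img2pdf.convert(image_files))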
Split one PDF into multiple PDFs
from PyPDF2 import PdfReader, PdfWriter
import os

def split_pdf(input_pdf, output_folder):
    """Split a single PDF into one PDF file per page."""
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Read the input PDF
    reader = PdfReader(input_pdf)
    num_pages = len(reader.pages)
    for page_num in range(num_pages):
        try:
            # Create a new PdfWriter for this page
            writer = PdfWriter()
            # Add the current page to the writer
            writer.add_page(reader.pages[page_num])
            # Build the output file name
            output_pdf = os.path.join(output_folder, f'page_{page_num + 1}.pdf')
            # Save the current page as a separate PDF
            with open(output_pdf, 'wb') as output_file:
                writer.write(output_file)
            print(f"Page {page_num + 1} saved to {output_pdf}")
        except Exception as e:
            print(f"Error processing page {page_num + 1}: {e}")

if __name__ == '__main__':
    # Input PDF path and output folder
    input_pdf = 'merged_output.pdf'  # replace with your own PDF path
    output_folder = 'output_pages'   # output folder
    # Split the PDF
    split_pdf(input_pdf, output_folder)
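A variant that often comes up is splitting into chunks of N pages instead of single pages. A short sketch with the same PdfReader/PdfWriter API (the function name and the chunk size of 10 are illustrative):

from PyPDF2 import PdfReader, PdfWriter
import os

def split_pdf_in_chunks(input_pdf, output_folder, pages_per_file=10):
    """Split a PDF into consecutive chunks of at most pages_per_file pages."""
    os.makedirs(output_folder, exist_ok=True)
    reader = PdfReader(input_pdf)
    num_pages = len(reader.pages)
    for start in range(0, num_pages, pages_per_file):
        end = min(start + pages_per_file, num_pages)
        writer = PdfWriter()
        for i in range(start, end):
            writer.add_page(reader.pages[i])
        output_pdf = os.path.join(output_folder, f'pages_{start + 1}-{end}.pdf')
        with open(output_pdf, 'wb') as output_file:
            writer.write(output_file)

split_pdf_in_chunks('merged_output.pdf', 'output_chunks', pages_per_file=10)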
Split one PDF into multiple images
- Method 1: using the PyMuPDF package
import fitz  # PyMuPDF
import os

def pdf_to_images(pdf_path, output_folder, zoom_x=2.0, zoom_y=2.0):
    """Split a single PDF into one image file per page."""
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    # Open the input PDF
    pdf_document = fitz.open(pdf_path)
    num_pages = pdf_document.page_count
    for page_num in range(num_pages):
        try:
            # Load the current page
            page = pdf_document.load_page(page_num)
            # Zoom factor: PyMuPDF renders at 72 dpi by default, so 2.0 gives 144 dpi
            mat = fitz.Matrix(zoom_x, zoom_y)
            pix = page.get_pixmap(matrix=mat)
            # Build the output file name
            output_image = os.path.join(output_folder, f'page_{page_num + 1}.png')
            # Save the image
            pix.save(output_image)
            print(f"Page {page_num + 1} saved to {output_image}")
        except Exception as e:
            print(f"Error processing page {page_num + 1}: {e}")
    pdf_document.close()

if __name__ == '__main__':
    # Input PDF path and output folder
    pdf_path = 'merged_output.pdf'  # replace with your own PDF path
    output_folder = 'output_images'  # output folder
    # Convert the PDF pages to images
    pdf_to_images(pdf_path, output_folder, zoom_x=2.0, zoom_y=2.0)
- Method 2: using the pdf2image package
from pdf2image import convert_from_path
import os
from PIL import Image

# Relax Pillow's decompression-bomb limit for very large rendered pages
Image.MAX_IMAGE_PIXELS = None  # remove the limit entirely
# Or set a generous explicit limit, e.g. one billion pixels:
# Image.MAX_IMAGE_PIXELS = 1_000_000_000

def pdf_to_images(pdf_path, output_folder, dpi=300, poppler_path=None):
    """Split a single PDF into one image file per page."""
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    try:
        # Convert the PDF into a list of PIL images
        images = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
        # Save every page as a separate file
        for i, image in enumerate(images):
            output_image = os.path.join(output_folder, f'page_{i + 1}.png')
            image.save(output_image, 'PNG')
            print(f"Page {i + 1} saved to {output_image}")
    except Exception as e:
        print(f"Error converting PDF to images: {e}")

if __name__ == '__main__':
    # Input PDF path and output folder
    pdf_path = 'merged_images.pdf'  # replace with your own PDF path
    output_folder = 'output_images'  # output folder
    # Explicit Poppler path (Windows only); adjust it to your installation
    poppler_path = r'D:\popper\Library\bin'
    # On Linux or macOS you can leave poppler_path as None if Poppler is on PATH
    # poppler_path = None
    # Convert the PDF pages to images
    pdf_to_images(pdf_path, output_folder, dpi=300, poppler_path=poppler_path)
- Method 3: the pdf2image library actually depends on the Poppler toolset. Once Poppler is installed and its bin directory (e.g. D:\popper\Library\bin) has been added to the PATH system environment variable, you can render pages to images directly from the command line:
pdftoppm -f 1 -l 1 -png F:\dxxwycj\src\123.pdf page_1
# In the command above, -f is the first page to convert and -l is the last page
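To drive pdftoppm from Python rather than a shell, a minimal subprocess wrapper looks like this (a sketch; it assumes pdftoppm is on PATH, and the paths are the same placeholders as above):

import subprocess

def pdf_page_to_png(pdf_path, output_prefix, page=1):
    """Render a single PDF page to PNG by calling the pdftoppm command-line tool."""
    subprocess.run(
        ['pdftoppm', '-f', str(page), '-l', str(page), '-png', pdf_path, output_prefix],
        check=True,  # raise CalledProcessError if pdftoppm exits with an error
    )

pdf_page_to_png(r'F:\dxxwycj\src\123.pdf', 'page_1', page=1)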
Extract selected pages from a PDF into a new PDF
from PyPDF2 import PdfReader, PdfWriter
import logging

# Configure logging (WARNING level so out-of-range page warnings are recorded too)
logging.basicConfig(filename='pdf_errors.log', level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def extract_pages(input_pdf_path, output_pdf_path, page_numbers):
    """Extract the given page numbers from the input PDF and save them to a new PDF."""
    try:
        # Open the source PDF
        with open(input_pdf_path, 'rb') as input_pdf:
            # Read the PDF
            pdf_reader = PdfReader(input_pdf)
            # Total number of pages
            total_pages = len(pdf_reader.pages)
            # Create a PDF writer
            pdf_writer = PdfWriter()
            # Loop over the pages to extract
            for page_number in page_numbers:
                # Make sure the page number is within range
                if 1 <= page_number <= total_pages:
                    # Page numbers start at 1, Python indices at 0, hence the - 1
                    page = pdf_reader.pages[page_number - 1]
                    pdf_writer.add_page(page)
                else:
                    print(f"Warning: Page number {page_number} is out of range (1 to {total_pages}).")
                    logging.warning(f"Page number {page_number} is out of range (1 to {total_pages}).")
            # Write the extracted pages to the new PDF
            with open(output_pdf_path, 'wb') as output_pdf:
                pdf_writer.write(output_pdf)
            print(f"Extracted pages saved to {output_pdf_path} successfully.")
    except Exception as e:
        print(f"Error extracting pages: {e}")
        logging.error(f"Error extracting pages: {str(e)}")

# Example usage
input_pdf_path = '12345.pdf'       # source PDF path
output_pdf_path = 'extracted.pdf'  # path of the extracted PDF
page_numbers = [1, 3, 5]           # pages to extract (here pages 1, 3 and 5)
extract_pages(input_pdf_path, output_pdf_path, page_numbers)
Remove the images from a PDF without changing its structure
import fitz  # PyMuPDF

def remove_images_from_pdf(input_pdf_path, output_pdf_path):
    # Open the input PDF
    doc = fitz.open(input_pdf_path)
    # Walk through every page of the PDF
    for page in doc:
        # List all images referenced by the current page
        image_list = page.get_images(full=True)
        # Delete each image
        for img in image_list:
            # The first item of each entry is the image's xref number
            xref = img[0]
            # Remove the image from the page
            page.delete_image(xref)
    # Save the modified PDF
    doc.save(output_pdf_path)
    # Close the document
    doc.close()

# Path of the input PDF
input_pdf_path = 'input.pdf'
# Path of the output PDF
output_pdf_path = 'input-new.pdf'
# Remove the images from the PDF
remove_images_from_pdf(input_pdf_path, output_pdf_path)
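One caveat, based on my understanding of how PyMuPDF handles deletion: the old image data can remain in the file as unreferenced objects, so the output does not necessarily shrink much. If reducing file size is also a goal, saving with garbage collection and stream compression usually helps; the option values below are typical, not mandatory:

# Instead of doc.save(output_pdf_path): drop unreferenced objects and recompress streams
doc.save(output_pdf_path, garbage=4, deflate=True)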
Extract the text, images and tables of a PDF file into separate files
import fitz  # PyMuPDF
import os
import tabula  # tabula-py

def process_pdf(input_pdf_path, output_dir):
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    # Create the sub-folders
    text_output_dir = os.path.join(output_dir, "texts")
    table_output_dir = os.path.join(output_dir, "tables")
    image_output_dir = os.path.join(output_dir, "images")
    os.makedirs(text_output_dir, exist_ok=True)
    os.makedirs(table_output_dir, exist_ok=True)
    os.makedirs(image_output_dir, exist_ok=True)
    try:
        # Open the PDF
        pdf_document = fitz.open(input_pdf_path)
        print("The PDF can be parsed.")
        full_text = []
        stop_extraction = False  # flag: stop extracting text once set
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # --- Text extraction ---
            blocks = page.get_text("blocks")  # extract text block by block
            if blocks:
                cleaned_blocks = []
                page_height = page.rect.height
                # Different header/footer filtering rules for the first page and the rest
                if page_num == 0:  # first page
                    header_footer_ratio = (0.22, 0.85)
                else:  # subsequent pages
                    header_footer_ratio = (0.1, 0.95)
                for block in blocks:
                    x0, y0, x1, y1, text, _, _ = block  # unpack the text block
                    text = text.strip()  # strip surrounding whitespace
                    # Skip headers and footers: keep only blocks inside the vertical band
                    if y0 > page_height * header_footer_ratio[0] and y1 < page_height * header_footer_ratio[1]:
                        # if "参考文献" in text:  # stop extracting once the references section is reached
                        #     stop_extraction = True
                        #     break
                        cleaned_blocks.append(text)
                if stop_extraction:
                    break
                # Merge the blocks of one page into a single paragraph to reduce line breaks
                if cleaned_blocks:
                    paragraph = "".join(cleaned_blocks)  # concatenate the blocks of this page
                    full_text.append(paragraph)
            # --- Table extraction (tabula-py) ---
            try:
                # Let tabula detect table areas automatically (no manual areas needed)
                df_list = tabula.read_pdf(
                    input_pdf_path,
                    pages=f"{page_num + 1}",  # current page
                    multiple_tables=True,     # allow several tables per page
                    lattice=False,            # lattice=True works better for tables with ruling lines
                    stream=True,              # stream=True works better for tables without ruling lines
                    pandas_options={'header': None}  # do not treat the first row as a header
                )
                if df_list:
                    for idx, df in enumerate(df_list):
                        if not df.empty:  # skip empty tables
                            table_name = f"table_{page_num + 1}_{idx + 1}.csv"
                            table_file_path = os.path.join(table_output_dir, table_name)
                            # Save the table as a CSV file
                            df.to_csv(table_file_path, index=False, header=False)
                            print(f"Extracted and saved table: {table_file_path}")
                        else:
                            print(f"Table {idx + 1} on page {page_num + 1} is empty, skipped.")
                else:
                    print(f"No tables found on page {page_num + 1}.")
            except Exception as e:
                print(f"Could not extract tables from page {page_num + 1}: {e}")
            # --- Image extraction ---
            images = page.get_images(full=True)
            if not images:
                print(f"No images found on page {page_num + 1}.")
                continue
            for img_index, img in enumerate(images):
                try:
                    # Get the raw image data by xref
                    xref = img[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    # Name images by page and position (e.g. Page_1_Image_1.png)
                    image_name = f"Page_{page_num + 1}_Image_{img_index + 1}.{image_ext}"
                    image_file_path = os.path.join(image_output_dir, image_name)
                    # Save the image file
                    with open(image_file_path, "wb") as image_file:
                        image_file.write(image_bytes)
                    print(f"Saved image: {image_file_path}")
                except Exception as e:
                    print(f"Could not save image {img_index + 1} on page {page_num + 1}: {e}")
        # --- Write the cleaned text to a file ---
        if full_text:
            cleaned_text = "\n".join(full_text)  # one line per page
            text_file_path = os.path.join(text_output_dir, "extracted_text.txt")
            with open(text_file_path, "w", encoding="utf-8") as text_file:
                text_file.write(cleaned_text)
    except Exception as e:
        print(f"Could not parse the PDF: {e}")

# Example usage
input_pdf = "input.pdf"      # input PDF path
output_directory = "output"  # output folder path
process_pdf(input_pdf, output_directory)
print("Processing finished!")
The script above has one weakness: tables and some images are not extracted accurately. For the tables, a sketch of an alternative based on pdfplumber follows.
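pdfplumber often detects tables in text-based PDFs more reliably than tabula and does not need a Java runtime. A minimal sketch, assuming pdfplumber and pandas are installed (the file and folder names mirror the example above):

import os
import pdfplumber
import pandas as pd

def extract_tables_with_pdfplumber(input_pdf_path, table_output_dir):
    """Save every table pdfplumber detects as its own CSV file."""
    os.makedirs(table_output_dir, exist_ok=True)
    with pdfplumber.open(input_pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for idx, table in enumerate(page.extract_tables(), start=1):
                df = pd.DataFrame(table)  # rows of extracted cell text
                csv_path = os.path.join(table_output_dir, f"table_{page_num}_{idx}.csv")
                df.to_csv(csv_path, index=False, header=False)
                print(f"Extracted and saved table: {csv_path}")

extract_tables_with_pdfplumber("input.pdf", os.path.join("output", "tables"))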
Remove the images from a Word document and save the result as a new Word file
from docx import Document
from docx.oxml.ns import qn
from docx.oxml import OxmlElement

def remove_images_from_word(input_path, output_path):
    # Open the Word document
    doc = Document(input_path)
    # Create a new document that will hold the image-free content
    new_doc = Document()
    # Walk through every paragraph of the original document
    for paragraph in doc.paragraphs:
        new_paragraph = new_doc.add_paragraph()
        # Try to copy the paragraph style; skip it if the style does not exist in the new document
        try:
            if paragraph.style:
                new_paragraph.style = paragraph.style.name
        except KeyError:
            print(f"Warning: style '{paragraph.style.name}' does not exist, skipping style assignment.")
        # Walk through the runs of the paragraph and drop those that contain a picture
        for run in paragraph.runs:
            contains_image = (run._r.find(qn('w:drawing')) is not None
                              or run._r.find(qn('w:pict')) is not None)
            if not contains_image:
                new_run = new_paragraph.add_run(run.text)
                # Copy basic character formatting
                if run.bold:
                    new_run.bold = True
                if run.italic:
                    new_run.italic = True
                if run.underline:
                    new_run.underline = True
                new_run.font.name = run.font.name
                new_run.font.size = run.font.size
    # Copy every table of the original document
    for table in doc.tables:
        new_table = new_doc.add_table(rows=len(table.rows), cols=len(table.columns))
        for i, row in enumerate(table.rows):
            for j, cell in enumerate(row.cells):
                new_cell = new_table.cell(i, j)
                new_cell.text = cell.text
    # Handle special content such as the table of contents
    for element in doc.element.body:
        if element.tag.endswith('sdt'):  # structured document tags hold the TOC and similar content
            # Check whether this block is a table-of-contents field
            if 'Table of Contents' in ''.join([t.text for t in element.xpath('.//w:t', namespaces=element.nsmap)]):
                # Add a TOC field to the new document
                add_toc(new_doc)
                # Extract the visible text of the original TOC
                toc_text = extract_toc_text(doc)
                if toc_text:
                    # Add the TOC text to the new document as plain paragraphs
                    for line in toc_text.split('\n'):
                        new_doc.add_paragraph(line)
    # Save the new document
    new_doc.save(output_path)
    print(f"Saved the image-free document to {output_path}")

def add_toc(doc):
    """Insert a table-of-contents field into the document."""
    paragraph = doc.add_paragraph()
    run = paragraph.add_run()
    fld_char1 = OxmlElement('w:fldChar')
    fld_char1.set(qn('w:fldCharType'), 'begin')
    instr_text = OxmlElement('w:instrText')
    instr_text.set(qn('xml:space'), 'preserve')
    instr_text.text = r'TOC \o "1-3" \h \z'
    fld_char2 = OxmlElement('w:fldChar')
    fld_char2.set(qn('w:fldCharType'), 'end')
    run._r.append(fld_char1)
    run._r.append(instr_text)
    run._r.append(fld_char2)

def extract_toc_text(doc):
    """Extract the visible text of the table of contents."""
    toc_text = ""
    in_toc = False
    for paragraph in doc.paragraphs:
        if 'Table of Contents' in paragraph.text:
            in_toc = True
            continue
        if in_toc and paragraph.style and 'TOC' in paragraph.style.name:
            toc_text += paragraph.text + "\n"
        else:
            in_toc = False
    return toc_text.strip()

# Example usage
input_file = "input.docx"    # input Word file path
output_file = "output.docx"  # output path of the image-free Word file
remove_images_from_word(input_file, output_file)
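Rebuilding the document run by run inevitably loses formatting (headers, footers, numbering, section layout). An alternative sketch that usually preserves more: open the original document, remove the picture elements in place, and save it under a new name. The element names come from WordprocessingML (w:drawing for modern DrawingML pictures, w:pict for legacy VML pictures); headers and footers are not touched here.

from docx import Document
from docx.oxml.ns import qn

def strip_images_in_place(input_path, output_path):
    """Remove picture elements from the document body while keeping everything else."""
    doc = Document(input_path)
    body = doc.element.body
    for tag in (qn('w:drawing'), qn('w:pict')):
        # list() so elements can be removed safely while iterating
        for element in list(body.iter(tag)):
            element.getparent().remove(element)
    doc.save(output_path)

strip_images_in_place("input.docx", "output.docx")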
Extract the images, text and tables from a Word document separately
import os
from docx import Document
from docx.text.paragraph import Paragraph
import openpyxl

def extract_text(doc, output_folder):
    """Extract the text and save it as a .txt file."""
    text_file = os.path.join(output_folder, "extracted_text.txt")
    with open(text_file, "w", encoding="utf-8") as f:
        for paragraph in doc.paragraphs:
            f.write(paragraph.text + "\n")
    print(f"Text extracted and saved to {text_file}")

def extract_images(doc, output_folder):
    """Extract the images and name them by (approximate) page number and order."""
    # Create the output directory
    image_folder = os.path.join(output_folder, "images")
    os.makedirs(image_folder, exist_ok=True)
    # State variables
    page_number = 1           # current (approximate) page number
    image_index_per_page = 0  # image counter within the current page
    # Namespace map for the XML lookups
    nsmap = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    }
    # Relationship dictionary of the document part
    rels = doc.part.rels
    # Walk through the paragraphs and tables of the document
    paragraphs_and_tables = list(enumerate(doc.paragraphs + list(doc.tables)))
    for idx, block in paragraphs_and_tables:
        if isinstance(block, Paragraph):  # paragraphs may carry an explicit page marker
            if "_next_page" in block.text:  # custom marker that signals a page change
                page_number += 1
                image_index_per_page = 0
                continue
        # Without a _next_page marker, approximate page changes by block count
        if idx > 0 and idx % 50 == 0:  # treat every 50 paragraphs/tables as one page
            page_number += 1
            image_index_per_page = 0
        # Extract the images referenced by the paragraph or table
        if hasattr(block, "_element"):
            for pic in block._element.iterfind(".//pic:pic", namespaces=nsmap):
                blip = pic.find(".//a:blip", namespaces=nsmap)
                if blip is not None and blip.attrib.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"):
                    r_id = blip.attrib["{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"]
                    if r_id in rels:
                        image_part = rels[r_id]._target
                        # Make sure the relationship target really is an image part
                        if hasattr(image_part, 'blob'):
                            image_bytes = image_part.blob
                            image_ext = os.path.splitext(image_part.partname)[1]  # keep the original extension
                            # Bump the per-page image counter
                            image_index_per_page += 1
                            # Build the file name from page number and image order
                            image_name = f"page_{page_number}_image_{image_index_per_page}{image_ext}"
                            # Save the image
                            image_path = os.path.join(image_folder, image_name)
                            with open(image_path, "wb") as img_file:
                                img_file.write(image_bytes)
                        else:
                            print(f"Warning: relationship {r_id} skipped, its target is not an image part.")
    print(f"Images extracted and saved to {image_folder}")

def extract_tables(doc, output_folder):
    """Extract the tables and save them as .xlsx files."""
    table_folder = os.path.join(output_folder, "tables")
    os.makedirs(table_folder, exist_ok=True)
    for idx, table in enumerate(doc.tables):
        wb = openpyxl.Workbook()
        ws = wb.active
        # Write the table content into the worksheet
        for row_idx, row in enumerate(table.rows):
            for col_idx, cell in enumerate(row.cells):
                ws.cell(row=row_idx + 1, column=col_idx + 1, value=cell.text)
        # Save each table as its own .xlsx file
        table_path = os.path.join(table_folder, f"table_{idx + 1}.xlsx")
        wb.save(table_path)
    print(f"Tables extracted and saved to {table_folder}")

def main(input_file, output_folder):
    # Open the Word document
    doc = Document(input_file)
    # Create the output folder
    os.makedirs(output_folder, exist_ok=True)
    # Extract the text
    extract_text(doc, output_folder)
    # Extract the images
    extract_images(doc, output_folder)
    # Extract the tables
    extract_tables(doc, output_folder)

# Example usage
input_file = "input.docx"          # input Word file path
output_folder = "extracted_files"  # output folder path
main(input_file, output_folder)
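The page-number heuristic above is only an approximation, because a .docx file has no fixed page boundaries until it is rendered. When the goal is simply to pull out every embedded image, iterating the document part's relationships is much simpler; a sketch (it ignores page numbers and names files in relationship order):

import os
from docx import Document

def extract_all_images(input_file, image_folder):
    """Save every image part referenced by the document, numbered in relationship order."""
    os.makedirs(image_folder, exist_ok=True)
    doc = Document(input_file)
    count = 0
    for rel in doc.part.rels.values():
        if "image" in rel.reltype:  # the relationship type of images ends in '/image'
            count += 1
            part = rel.target_part
            ext = os.path.splitext(part.partname)[1]  # keep the original extension
            with open(os.path.join(image_folder, f"image_{count}{ext}"), "wb") as f:
                f.write(part.blob)

extract_all_images("input.docx", os.path.join("extracted_files", "images_all"))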