Reorder the term entries of a Word document according to the sequence given in a separate text file, and write the result out as a new document.
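The layout of extracted_headings.txt is not documented anywhere, so the sample below is only an illustration inferred from parse_structure() further down: a line that starts with "第" and contains "部分" (for example "第一部分") opens a part, a line starting with "A." through "H." opens a section, and every other non-empty line is treated as a term. A hypothetical input could look like:

    第一部分 基础概念
    A. 机器学习
    人工智能
    神经网络(Neural Network)
    B. 深度学习
    卷积神经网络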
import re
from copy import deepcopy
from difflib import SequenceMatcher

from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn


def read_extracted_headings(filename):
    """Read extracted_headings.txt and return its raw text."""
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    return content


def parse_structure(content):
    """Parse the headings text and return an ordered list of structure items."""
    lines = content.strip().split('\n')
    structure = []
    current_part = None
    current_section = None
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Top-level part headings look like "第一部分", "第二部分", ...
        if line.startswith('第') and '部分' in line:
            current_part = line
            structure.append({
                'type': 'part',
                'title': line,
                'level': 0
            })
            current_section = None
        # Section headings are prefixed with "A." through "H."
        elif line.startswith(('A.', 'B.', 'C.', 'D.', 'E.', 'F.', 'G.', 'H.')):
            current_section = line
            structure.append({
                'type': 'section',
                'title': line,
                'level': 1
            })
        # Every other non-empty line is a term
        else:
            structure.append({
                'type': 'term',
                'title': line,
                'level': 2,
                'part': current_part,
                'section': current_section
            })
    return structure
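
# Illustration only (hypothetical input, matching the assumed file layout above):
# parse_structure("第一部分 基础概念\nA. 机器学习\n人工智能") returns
# [
#     {'type': 'part', 'title': '第一部分 基础概念', 'level': 0},
#     {'type': 'section', 'title': 'A. 机器学习', 'level': 1},
#     {'type': 'term', 'title': '人工智能', 'level': 2,
#      'part': '第一部分 基础概念', 'section': 'A. 机器学习'},
# ]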


def analyze_document_styles(doc):
    """Collect the style names of all non-empty paragraphs in the document."""
    styles_found = set()
    print("Analyzing document styles...")
    for paragraph in doc.paragraphs:
        if paragraph.text.strip():
            style_name = paragraph.style.name
            styles_found.add(style_name)
    print(f"Styles found in the document: {sorted(styles_found)}")
    return styles_found


def strip_brackets(text):
    """Remove every bracketed span, supporting both ASCII and full-width brackets."""
    # Drop "(...)" and "(...)" together with their contents
    return re.sub(r'[\((][^\))]*[\))]', '', text)
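
# Illustration only: strip_brackets('神经网络(Neural Network)') returns '神经网络',
# so fuzzy_match() below compares headings without their parenthesised glosses.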


def extract_terms_from_source_enhanced(source_doc):
    """Extract every term heading and its content (paragraphs and tables) in document order."""
    possible_heading_styles = [
        'Heading 2', '标题2', '标题 2', '样式2', 'Heading2', '2级标题', 'Title 2'
    ]
    print("Extracting terms from the source document...")
    styles_found = analyze_document_styles(source_doc)
    actual_heading_styles = [style for style in possible_heading_styles if style in styles_found]
    print(f"Heading styles found: {actual_heading_styles}")
    all_paragraphs = source_doc.paragraphs
    # Indices of all heading paragraphs
    heading_indices = []
    for i, para in enumerate(all_paragraphs):
        if para.style.name in actual_heading_styles:
            heading_indices.append((i, para.text.strip()))
    print(f"Found {len(heading_indices)} term headings in total")
    # Collect the content under each heading (paragraphs and tables, order preserved)
    terms_content = {}
    # Build an ordered list of body-level paragraphs and tables
    elements = []
    para_idx = 0
    for el in source_doc.element.body:
        if el.tag.endswith('p'):
            if para_idx < len(all_paragraphs) and all_paragraphs[para_idx]._element == el:
                elements.append(('paragraph', all_paragraphs[para_idx]))
                para_idx += 1
        elif el.tag.endswith('tbl'):
            elements.append(('table', el))
    # Slice the element list heading by heading
    for idx, (start, heading) in enumerate(heading_indices):
        end = heading_indices[idx + 1][0] if idx + 1 < len(heading_indices) else len(all_paragraphs)
        # Locate the heading paragraph inside `elements`:
        # start_pos is the position of paragraph number `start`
        start_pos = None
        para_count = 0
        for i, (tp, obj) in enumerate(elements):
            if tp == 'paragraph':
                if para_count == start:
                    start_pos = i
                    break
                para_count += 1
        # end_pos is the position of paragraph number `end` (the next heading)
        end_pos = None
        para_count = 0
        for i, (tp, obj) in enumerate(elements):
            if tp == 'paragraph':
                if para_count == end:
                    end_pos = i
                    break
                para_count += 1
        if start_pos is not None:
            content_elements = elements[start_pos + 1:end_pos] if end_pos is not None else elements[start_pos + 1:]
        else:
            content_elements = []
        terms_content[heading] = content_elements
        print(f" └─ Collected {len(content_elements)} content elements for '{heading}'")
    return terms_content
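
# Shape of the mapping returned above, for reference:
#   {'<heading text>': [('paragraph', Paragraph object), ('table', <w:tbl> lxml element), ...]}
# Paragraph entries carry python-docx objects while table entries carry raw XML elements,
# which is why create_new_document_ordered() handles the two kinds differently.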


def similarity(a, b):
    """Return the similarity ratio between two strings (case-insensitive)."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()


def fuzzy_match(target, candidates, threshold=0.6):
    """Return the best-matching candidate and its score, ignoring bracketed text."""
    best_match = None
    best_score = 0
    target_stripped = strip_brackets(target)
    for candidate in candidates:
        candidate_stripped = strip_brackets(candidate)
        score = similarity(target_stripped, candidate_stripped)
        if score > best_score and score >= threshold:
            best_score = score
            best_match = candidate
    return best_match, best_score
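
# Illustration only (hypothetical headings): with candidates
# ['神经网络(Neural Network)', '决策树'], fuzzy_match('神经网络', candidates) returns
# ('神经网络(Neural Network)', 1.0) because brackets are stripped before comparison;
# when no candidate reaches the 0.6 threshold the result is (None, 0).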


def copy_paragraph_with_style(source_para, target_doc):
    """Copy a paragraph into target_doc, preserving paragraph- and run-level formatting."""
    new_para = target_doc.add_paragraph()
    # Paragraph-level style and alignment.  Note: the style object comes from the source
    # document, so its style id is copied even if the target does not define that style.
    new_para.style = source_para.style
    new_para.alignment = source_para.alignment
    # Copy every run (text fragment) with its character formatting
    for run in source_para.runs:
        new_run = new_para.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name
        # Font colour (only when an explicit RGB value is set)
        if run.font.color.rgb is not None:
            new_run.font.color.rgb = run.font.color.rgb
        # Highlight colour may be unavailable on some runs
        try:
            new_run.font.highlight_color = run.font.highlight_color
        except Exception:
            pass
    # Paragraph spacing
    if source_para.paragraph_format.space_before:
        new_para.paragraph_format.space_before = source_para.paragraph_format.space_before
    if source_para.paragraph_format.space_after:
        new_para.paragraph_format.space_after = source_para.paragraph_format.space_after
    if source_para.paragraph_format.line_spacing:
        new_para.paragraph_format.line_spacing = source_para.paragraph_format.line_spacing
    return new_para


def copy_table_with_style(source_table_element, target_doc):
    """Copy a table (structure and cell text) into target_doc."""
    # Wrap the raw <w:tbl> element in a throwaway Document so python-docx can read it
    temp_doc = Document()
    temp_doc._body.clear_content()
    temp_doc._body._element.append(deepcopy(source_table_element))
    table = temp_doc.tables[0]
    rows = len(table.rows)
    cols = len(table.columns)
    if rows == 0 or cols == 0:
        return None
    # Build a fresh table in the target document
    new_table = target_doc.add_table(rows=rows, cols=cols)
    # Copy the plain text of every cell (run-level formatting inside cells is not kept)
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            new_table.cell(i, j).text = cell.text
    # Copy the overall table style
    new_table.style = table.style
    return new_table
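
# Alternative sketch (not called anywhere in this script): cloning the raw <w:tbl>
# element keeps borders, merged cells and in-cell formatting that the text-only copy
# above drops; the target document must still define any table style the XML references.
def copy_table_verbatim(source_table_element, target_doc):
    """Append a deep copy of a <w:tbl> element to target_doc, leaving its XML untouched."""
    new_tbl = deepcopy(source_table_element)
    body = target_doc._body._element
    sect_pr = body.find(qn('w:sectPr'))
    if sect_pr is not None:
        sect_pr.addprevious(new_tbl)  # keep sectPr as the last body child
    else:
        body.append(new_tbl)
    return new_tbl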


def insert_empty_paragraph(target_doc):
    """Add an empty spacer paragraph after a term."""
    # add_paragraph() inserts before the body's trailing sectPr, keeping the XML valid
    target_doc.add_paragraph()


def create_new_document_ordered(source_doc, structure, source_terms):
    """Create the new document following the order given by `structure`."""
    new_doc = Document()
    # Page margins
    sections = new_doc.sections
    for section in sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)
    term_count = 0
    matched_terms = set()
    print("\nCreating the new document...")
    print(f"Found {len(source_terms)} terms in the source document")
    print("=" * 50)
    for item in structure:
        if item['type'] == 'part':
            # Part title on its own page
            print(f"\nCreating part: {item['title']}")
            heading = new_doc.add_heading(item['title'], level=0)
            heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
            new_doc.add_page_break()
        elif item['type'] == 'section':
            # Section title (Heading 1)
            print(f"\nCreating section: {item['title']}")
            new_doc.add_heading(item['title'], level=1)
        elif item['type'] == 'term':
            # Find the matching term in the source document
            target_term = item['title']
            source_term_names = list(source_terms.keys())
            matched_term, similarity_score = fuzzy_match(target_term, source_term_names)
            if matched_term and similarity_score >= 0.6:
                term_count += 1
                matched_terms.add(matched_term)
                print(f"[{term_count}] Copying term: '{target_term}'")
                if matched_term != target_term:
                    print(f" └─ Matched source term: '{matched_term}' (similarity: {similarity_score:.2f})")
                # Term title (Heading 2)
                new_doc.add_heading(target_term, level=2)
                # Copy the term's content
                content_elements = source_terms[matched_term]
                if content_elements:
                    for tp, obj in content_elements:
                        if tp == 'paragraph':
                            copy_paragraph_with_style(obj, new_doc)
                        elif tp == 'table':
                            copy_table_with_style(obj, new_doc)
                    print(f" └─ Copied {len(content_elements)} content elements")
                else:
                    print(" └─ Warning: no content found")
                    new_doc.add_paragraph(f"[No content found for '{target_term}']")
                insert_empty_paragraph(new_doc)
            else:
                print(f"[Skipped] No matching term found for: '{target_term}'")
                if matched_term:
                    print(f" └─ Best match: '{matched_term}' (similarity: {similarity_score:.2f}, below the 0.6 threshold)")
                # Still add the heading, but mark the content as missing
                new_doc.add_heading(target_term, level=2)
                new_doc.add_paragraph(f"[No content found for '{target_term}']")
                insert_empty_paragraph(new_doc)
    print("=" * 50)
    print("Document created.")
    print(f"Terms processed: {term_count}")
    print(f"Successfully matched: {len(matched_terms)}")
    return new_doc


if __name__ == '__main__':
    # File names
    source_docx = 'AI_zero2hero20250604.docx'
    headings_txt = 'extracted_headings.txt'
    output_docx = 'AI_zero2hero20250606.docx'
    # Read the target structure
    headings_content = read_extracted_headings(headings_txt)
    structure = parse_structure(headings_content)
    # Open the source docx (used read-only)
    print(f"Opening source document: {source_docx}")
    source_doc = Document(source_docx)
    # Extract every term and its content
    source_terms = extract_terms_from_source_enhanced(source_doc)
    # Create the new document
    print(f"\nGenerating new document: {output_docx}")
    new_doc = create_new_document_ordered(source_doc, structure, source_terms)
    # Save the result
    new_doc.save(output_docx)
    print(f"New document saved: {output_docx}")