Reordering and copying entries in a Word document

Take the entries (terms) in one Word document, reorder them according to the sequence given in a separate text file, and write the result to a new document.
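
For reference, a minimal sketch of the layout that extracted_headings.txt is assumed to follow (the entry names below are made up for illustration): top-level parts are lines containing "第...部分", sub-sections are lines starting with "A." through "H.", and every other non-empty line is treated as a term title.

第一部分 基础概念
A. 模型
Transformer
注意力机制(Attention)
B. 训练
反向传播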

from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
import re
from difflib import SequenceMatcher
from copy import deepcopy

def read_extracted_headings(filename):
    """Read extracted_headings.txt and return its raw text."""
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    return content

def parse_structure(content):
    """Parse the outline text and return an ordered list of structure items."""
    lines = content.strip().split('\n')
    structure = []
    current_part = None
    current_section = None
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Top-level parts, i.e. lines such as "第一部分 ...", "第二部分 ..."
        if line.startswith('第') and '部分' in line:
            current_part = line
            structure.append({
                'type': 'part',
                'title': line,
                'level': 0
            })
            current_section = None
        # Sub-sections labelled "A." through "H."
        elif line.startswith(('A.', 'B.', 'C.', 'D.', 'E.', 'F.', 'G.', 'H.')):
            current_section = line
            structure.append({
                'type': 'section',
                'title': line,
                'level': 1
            })
        # Everything else is treated as a term (entry) title
        else:
            structure.append({
                'type': 'term',
                'title': line,
                'level': 2,
                'part': current_part,
                'section': current_section
            })
    
    return structure
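
# For illustration (hypothetical input): a line such as "第一部分 基础概念" becomes
#   {'type': 'part', 'title': '第一部分 基础概念', 'level': 0},
# "A. 模型" becomes a {'type': 'section', ...} item, and any other non-empty line
# becomes {'type': 'term', 'title': ..., 'level': 2, 'part': ..., 'section': ...}.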

def analyze_document_styles(doc):
    """Collect the style names used by non-empty paragraphs in the document."""
    styles_found = set()
    
    print("Analyzing document styles...")
    for paragraph in doc.paragraphs:
        if paragraph.text.strip():
            style_name = paragraph.style.name
            styles_found.add(style_name)
    
    print(f"Styles found in the document: {sorted(styles_found)}")
    return styles_found

def strip_brackets(text):
    """Remove all brackets and their contents (full-width and half-width)."""
    # Drop every "(...)" and "(...)" together with the text inside it
    return re.sub(r'[\((][^\))]*[\))]', '', text)
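
# Example: strip_brackets("注意力机制(Attention)") -> "注意力机制"; brackets and
# their contents are removed so that fuzzy matching compares bare term names.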

def extract_terms_from_source_enhanced(source_doc):
    """Extract every term (heading) and its content, including both paragraphs
    and tables, while preserving their original order."""
    # Candidate style names for term headings (English and Chinese variants)
    possible_heading_styles = [
        'Heading 2', '标题2', '标题 2', '样式2', 'Heading2', '2级标题', 'Title 2'
    ]
    print("Extracting terms from the source document...")
    styles_found = analyze_document_styles(source_doc)
    actual_heading_styles = [style for style in possible_heading_styles if style in styles_found]
    print(f"Heading styles in use: {actual_heading_styles}")
    all_paragraphs = source_doc.paragraphs
    # Indices of all heading paragraphs
    heading_indices = []
    for i, para in enumerate(all_paragraphs):
        if para.style.name in actual_heading_styles:
            heading_indices.append((i, para.text.strip()))
    print(f"Found {len(heading_indices)} terms in total")
    # Collect the content under each heading (paragraphs and tables, in order)
    terms_content = {}
    # Build a single list of body-level paragraphs and tables in document order
    elements = []
    para_idx = 0
    for el in source_doc.element.body:
        if el.tag == qn('w:p'):
            if para_idx < len(all_paragraphs) and all_paragraphs[para_idx]._element is el:
                elements.append(('paragraph', all_paragraphs[para_idx]))
                para_idx += 1
        elif el.tag == qn('w:tbl'):
            elements.append(('table', el))
    # Slice the element list heading by heading
    for idx, (start, heading) in enumerate(heading_indices):
        end = heading_indices[idx+1][0] if idx+1 < len(heading_indices) else len(all_paragraphs)
        # start_pos: position of the current heading paragraph in `elements`
        start_pos = None
        para_count = 0
        for i, (tp, obj) in enumerate(elements):
            if tp == 'paragraph':
                if para_count == start:
                    start_pos = i
                    break
                para_count += 1
        # end_pos: position of the next heading paragraph in `elements`
        end_pos = None
        para_count = 0
        for i, (tp, obj) in enumerate(elements):
            if tp == 'paragraph':
                if para_count == end:
                    end_pos = i
                    break
                para_count += 1
        if start_pos is not None:
            content_elements = elements[start_pos+1:end_pos] if end_pos is not None else elements[start_pos+1:]
        else:
            content_elements = []
        terms_content[heading] = content_elements
        print(f"  └─ Collected {len(content_elements)} content elements for '{heading}'")
    return terms_content
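
# Note: source_doc.paragraphs and source_doc.element.body only expose body-level
# content, so paragraphs nested inside tables, text boxes or headers/footers are
# not collected here; each table is kept as its raw w:tbl element.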

def similarity(a, b):
    """Return the similarity ratio between two strings (case-insensitive)."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def fuzzy_match(target, candidates, threshold=0.6):
    """Fuzzy-match `target` against `candidates`, ignoring bracketed text.
    Returns the best match and its similarity score."""
    best_match = None
    best_score = 0
    target_stripped = strip_brackets(target)
    for candidate in candidates:
        candidate_stripped = strip_brackets(candidate)
        score = similarity(target_stripped, candidate_stripped)
        if score > best_score and score >= threshold:
            best_score = score
            best_match = candidate
    return best_match, best_score
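
# Example (hypothetical): fuzzy_match("注意力机制(Attention)", ["注意力机制", "激活函数"])
# strips the brackets first and returns ("注意力机制", 1.0); if no candidate reaches
# the 0.6 threshold the function returns (None, 0).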

def copy_paragraph_with_style(source_para, target_doc):
    """Copy a paragraph into the target document, keeping its style and formatting."""
    # Create the new paragraph
    new_para = target_doc.add_paragraph()
    
    # Copy the paragraph-level style; assign by name so that a style defined only
    # in the source document fails gracefully instead of leaving a dangling style id
    try:
        new_para.style = source_para.style.name
    except KeyError:
        pass  # style not defined in the target document; keep the default
    new_para.alignment = source_para.alignment
    
    # Copy every run (text fragment) in the paragraph
    for run in source_para.runs:
        new_run = new_para.add_run(run.text)
        
        # Copy run-level formatting
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.size = run.font.size
        new_run.font.name = run.font.name
        
        # Copy the font colour
        if run.font.color.rgb:
            new_run.font.color.rgb = run.font.color.rgb
            
        # Copy other optional formatting attributes
        try:
            new_run.font.highlight_color = run.font.highlight_color
        except Exception:
            pass
    
    # Copy paragraph spacing
    if source_para.paragraph_format.space_before:
        new_para.paragraph_format.space_before = source_para.paragraph_format.space_before
    if source_para.paragraph_format.space_after:
        new_para.paragraph_format.space_after = source_para.paragraph_format.space_after
    if source_para.paragraph_format.line_spacing:
        new_para.paragraph_format.line_spacing = source_para.paragraph_format.line_spacing
        
    return new_para


def copy_table_with_style(source_table_element, target_doc):
    """Copy a table (structure, cell text and table-level style) into the target
    document. Note: run-level formatting inside cells is not preserved."""
    # Wrap the raw w:tbl element in a temporary Document so python-docx can read it
    temp_doc = Document()
    temp_doc._body.clear_content()
    temp_doc._body._element.append(deepcopy(source_table_element))
    table = temp_doc.tables[0]
    rows = len(table.rows)
    cols = len(table.columns)
    if rows == 0 or cols == 0:
        return None
    # Create a new table of the same size in the target document
    new_table = target_doc.add_table(rows=rows, cols=cols)
    # Copy the cell text
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            new_table.cell(i, j).text = cell.text
    # Copy the table style
    new_table.style = table.style
    return new_table
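
# A minimal alternative sketch (not used above): if merged cells and cell-level
# formatting must survive, the raw w:tbl element can be deep-copied into the
# target document instead of rebuilding the table cell by cell.
def copy_table_element_verbatim(source_table_element, target_doc):
    """Append a deep copy of a w:tbl element right after a fresh anchor paragraph."""
    anchor = target_doc.add_paragraph()   # anchor paragraph inside the document body
    new_tbl = deepcopy(source_table_element)
    anchor._element.addnext(new_tbl)      # lxml: insert as the following sibling
    return new_tbl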

def insert_empty_paragraph(target_doc):
    """Insert an empty paragraph as a visual separator between terms."""
    # add_paragraph() places the new w:p before the trailing sectPr, so the blank
    # paragraph really lands between terms rather than after the section marker
    target_doc.add_paragraph()

def create_new_document_ordered(source_doc, structure, source_terms):
    """Create the new document following the parsed structure order."""
    new_doc = Document()
    
    # Set page margins
    sections = new_doc.sections
    for section in sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)
    
    term_count = 0
    matched_terms = set()
    
    print("\nCreating the new document...")
    print(f"Found {len(source_terms)} terms in the source document")
    print("=" * 50)
    
    for item in structure:
        if item['type'] == 'part':
            # Add the part title (on its own page)
            print(f"\nCreating part: {item['title']}")
            heading = new_doc.add_heading(item['title'], level=0)
            heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
            new_doc.add_page_break()
            
        elif item['type'] == 'section':
            # Add the sub-section title (Heading 1 style)
            print(f"\nCreating section: {item['title']}")
            new_doc.add_heading(item['title'], level=1)
            
        elif item['type'] == 'term':
            # Look for the matching term in the source document
            target_term = item['title']
            source_term_names = list(source_terms.keys())
            
            matched_term, similarity_score = fuzzy_match(target_term, source_term_names)
            
            if matched_term and similarity_score >= 0.6:
                term_count += 1
                matched_terms.add(matched_term)
                
                print(f"[{term_count}] Copying term: '{target_term}'")
                if matched_term != target_term:
                    print(f"    └─ Matched source term: '{matched_term}' (similarity: {similarity_score:.2f})")
                
                # Add the term title (Heading 2 style)
                new_doc.add_heading(target_term, level=2)
                
                # Copy the term's content
                content_elements = source_terms[matched_term]
                
                if content_elements:
                    for tp, obj in content_elements:
                        if tp == 'paragraph':
                            copy_paragraph_with_style(obj, new_doc)
                        elif tp == 'table':
                            copy_table_with_style(obj, new_doc)
                    
                    print(f"    └─ Copied {len(content_elements)} content elements")
                else:
                    print("    └─ Warning: no content found")
                    new_doc.add_paragraph(f"[No content found for '{target_term}']")
                
                insert_empty_paragraph(new_doc)
            else:
                print(f"[Skipped] No matching term found: '{target_term}'")
                if matched_term:
                    print(f"    └─ Best match: '{matched_term}' (similarity: {similarity_score:.2f}, below the 0.6 threshold)")
                
                # Still add the heading, but mark the content as missing
                new_doc.add_heading(target_term, level=2)
                new_doc.add_paragraph(f"[No content found for '{target_term}']")
                insert_empty_paragraph(new_doc)
    
    print("=" * 50)
    print("Document created!")
    print(f"Terms processed: {term_count}")
    print(f"Successfully matched: {len(matched_terms)}")
    return new_doc

if __name__ == '__main__':
    # File names
    source_docx = 'AI_zero2hero20250604.docx'
    headings_txt = 'extracted_headings.txt'
    output_docx = 'AI_zero2hero20250606.docx'

    # Read the target structure
    headings_content = read_extracted_headings(headings_txt)
    structure = parse_structure(headings_content)

    # Open the source docx (read only)
    print(f"Opening source document: {source_docx}")
    source_doc = Document(source_docx)

    # Extract all terms and their content
    source_terms = extract_terms_from_source_enhanced(source_doc)

    # Create the new document
    print(f"\nNew document to be generated: {output_docx}")
    new_doc = create_new_document_ordered(source_doc, structure, source_terms)

    # Save the new document
    new_doc.save(output_docx)
    print(f"New document saved: {output_docx}")