抽取 Word 文档中的所有标题

from docx import Document

def analyze_document_styles(docx_file_path):
    """
    Print the style name of every non-blank paragraph, then a sorted summary.

    Useful for finding out which style names a document actually uses
    before filtering paragraphs by style.

    Args:
        docx_file_path: Path handed to ``Document`` to open the file.

    Returns:
        None. Results are printed; any exception is caught and reported
        with a printed message instead of being raised.
    """
    try:
        document = Document(docx_file_path)
        seen_styles = set()

        print("文档中发现的所有样式:")
        for para in document.paragraphs:
            text = para.text
            # Skip paragraphs that are empty or whitespace-only.
            if not text.strip():
                continue
            name = para.style.name
            seen_styles.add(name)
            # Only the first 50 characters of the paragraph are shown.
            print(f"样式: '{name}' - 内容: '{text[:50]}...'")

        print(f"\n共发现 {len(seen_styles)} 种样式:")
        for name in sorted(seen_styles):
            print(f"- {name}")

    except Exception as err:
        print(f"分析文档时发生错误: {err!s}")

# Style names matched when the caller does not pass ``heading_styles``.
# NOTE(review): every entry names a level-2 style ("Heading 2", "标题2", ...)
# although the function name and its messages say level-1 ("一级标题").
# The list is kept unchanged so existing output stays the same — confirm
# which level is actually intended.
_DEFAULT_HEADING_STYLES = (
    'Heading 2',
    '标题2',
    '标题 2',
    '样式2',
    'Heading2',
    '2级标题',
    'Title 2',
)


def extract_heading1_enhanced(docx_file_path, txt_output_path, heading_styles=None):
    """
    Collect heading paragraphs by style name and save them to a text file.

    A paragraph is kept when ``paragraph.style.name`` exactly equals one of
    the accepted style names and its stripped text is non-empty. The kept
    texts are written to ``txt_output_path`` as a numbered list under a
    short header, and progress messages are printed along the way.

    Args:
        docx_file_path: Path handed to ``Document`` to open the file.
        txt_output_path: Path of the UTF-8 text file to create or overwrite.
        heading_styles: Optional iterable of style names to match. A single
            string is treated as one style name. When omitted, the built-in
            default list is used.

    Returns:
        The list of stripped heading texts in the order ``doc.paragraphs``
        yields them, or ``[]`` if any exception occurred (the error is
        printed, not raised).
    """
    try:
        if heading_styles is None:
            heading_styles = _DEFAULT_HEADING_STYLES
        elif isinstance(heading_styles, str):
            # A bare string would otherwise be split into single characters.
            heading_styles = (heading_styles,)
        accepted_styles = frozenset(heading_styles)

        doc = Document(docx_file_path)
        heading1_list = []

        for paragraph in doc.paragraphs:
            if paragraph.style.name not in accepted_styles:
                continue
            heading_text = paragraph.text.strip()
            if heading_text:
                heading1_list.append(heading_text)
                print(f"找到一级标题: {heading_text}")

        # Build the whole report first so the file is written in one call.
        report_lines = ["提取的一级标题:\n", "=" * 50 + "\n\n"]
        report_lines.extend(
            f"{i}. {heading}\n" for i, heading in enumerate(heading1_list, 1)
        )
        with open(txt_output_path, 'w', encoding='utf-8') as txt_file:
            txt_file.writelines(report_lines)

        print(f"\n成功提取了 {len(heading1_list)} 个一级标题")
        print(f"已保存到: {txt_output_path}")

        return heading1_list

    except Exception as e:
        # Best-effort boundary: report the problem and hand back an empty
        # result instead of propagating, as callers expect a list.
        print(f"提取标题时发生错误: {str(e)}")
        return []

# Usage example: inspect the document's styles, then extract the headings.
if __name__ == "__main__":
    source_docx = "AI_zero2hero20250604.docx"
    headings_txt = "extracted_headings.txt"

    # Optional first pass: list the styles present in the document.
    print("正在分析文档样式...")
    analyze_document_styles(source_docx)

    print(f"\n{'=' * 60}")
    print("正在提取一级标题...")

    # Extract the headings and write them to the text file.
    headings = extract_heading1_enhanced(source_docx, headings_txt)

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容