Python读取Word文档，识别字段颜色并解析字段

Python版本为3.7.3，读取的文档格式为.docx

文中带有简单注释

如果看不懂，可以从百度网盘下载完整工程直接查看，修改并运行其中的py文件即可
网盘下载
提取码：nngw
import os
import sys
import xlrd
import codecs
import collections
import json
import io
import docx
import string
from docx import Document
from docx.shared import RGBColor #这个是docx的颜色类 

# Module-wide state shared by readDocx() and writeDocx().

# Directory that contains this script; input and output paths are built from it.
curPath = os.path.dirname(os.path.abspath(__file__))
# Parsed question records accumulated by readDocx() and dumped by writeDocx().
convert_list = []
# Reset by writeDocx(); not otherwise used by the active code in this file.
type_list = []
# "_id" value that readDocx() assigns to the next completed record.
id = 1
# Length of the longest question line seen so far (updated by readDocx()).
maxLength = 0
# NOTE: an encoding declaration is only honoured on the first or second line
# of a file (PEP 263), so the line below has no effect at this position.
# coding=utf-8
# Get the document object (open the .docx file and parse its paragraphs)
# Markers that annotate the correct option in the source documents; everything
# from the marker onwards is dropped from the stored option text.
_ANSWER_MARKERS = ("(Direito)", "(Correcta)", "(Right)", "(Correct)")
# One record is 1 question paragraph followed by 4 option paragraphs.
_PARAGRAPHS_PER_RECORD = 5
# str() of the font colour that flags the correct option (pure green).
_CORRECT_ANSWER_RGB = "00FF00"


def _strip_numero_prefix(text):
    """Drop a "Número" label found at offset 0 or 1, plus the one character after it."""
    pos = text.find("Número")
    if pos != -1 and pos < 2:
        return text[pos + len("Número") + 1:]
    return text


def _split_off_numbering(text):
    """Return the part of *text* that follows the numbering separator.

    Separators are tried in priority order "-", ".", " "; the first kind that
    occurs anywhere in the text is used, split at its first occurrence. When
    none occurs the whole text is returned instead of raising ValueError.
    """
    for separator in ("-", ".", " "):
        pos = text.find(separator)
        if pos != -1:
            return text[pos + len(separator):]
    return text


def _strip_answer_marker(text):
    """Cut *text* at each known "correct answer" marker, checked in order."""
    for marker in _ANSWER_MARKERS:
        pos = text.find(marker)
        if pos != -1:
            text = text[:pos]
    return text


def _is_marked_correct(paragraph):
    """Return True if any run of *paragraph* uses the correct-answer font colour."""
    return any(
        str(run.font.color.rgb) == _CORRECT_ANSWER_RGB
        for run in paragraph.runs
    )


def readDocx(fileName,type):
    """Parse one .docx question bank and append its records to ``convert_list``.

    The first paragraph of the document and all blank paragraphs are skipped.
    The remaining paragraphs are consumed in groups of five: one question line
    followed by four option lines. Each completed group becomes a dict with
    the keys ``question``, ``subType``, ``option1``..``option4``, ``_id`` and,
    when an option contains a green (00FF00) run, ``rightIndex``.

    Args:
        fileName: Path of the document relative to ``curPath``, without the
            ``.docx`` extension. Backslashes are accepted as separators on
            every platform.
        type: Category label stored unchanged as ``subType`` (the name shadows
            the builtin but is kept so existing callers keep working).

    Side effects:
        Prints the resolved path, appends to the global ``convert_list``,
        increments the global ``id`` per record and raises the global
        ``maxLength`` to the longest question line seen.
    """
    global id
    global maxLength

    # Callers pass Windows-style relative paths; map "\\" to the local
    # separator so the same table also resolves on other platforms.
    relative = fileName.replace("\\", os.sep) + ".docx"
    xlsFile = os.path.join(curPath, relative)
    print("xlsFile: "+xlsFile)
    document = docx.Document(xlsFile)

    index = 0  # position inside the current record: 0 = question, 1..4 = options
    data = {}
    for position, paragraph in enumerate(document.paragraphs):
        if position == 0:  # the first paragraph is a header line, not a question
            continue
        text = paragraph.text
        if not text.strip():
            continue

        if index == 0:
            text = _strip_numero_prefix(text)
            if len(text) > maxLength:
                maxLength = len(text)
            # Anything after "|" is a sub-type annotation that is not exported.
            data["question"] = _split_off_numbering(text).split("|")[0]
            data["subType"] = type
        else:
            if _is_marked_correct(paragraph):
                data["rightIndex"] = index
            data["option"+str(index)] = _strip_answer_marker(text)

        index = index + 1

        if index >= _PARAGRAPHS_PER_RECORD:
            data["_id"] = id
            convert_list.append(data)
            index = 0
            id = id + 1
            data = {}
    # A trailing group with fewer than five paragraphs is discarded, as before.

def writeDocx(fileList,name):
    """Parse a set of question-bank documents and dump them as one JSON file.

    Args:
        fileList: Iterable of dicts with the keys ``"path"`` and ``"type"``;
            each pair is forwarded to ``readDocx``.
        name: Base name of the output file, written as
            ``<curPath>/topic/<name>.txt``.

    Side effects:
        Resets the globals ``id``, ``convert_list`` and ``type_list`` before
        parsing, creates the ``topic`` directory if needed and overwrites the
        output file with UTF-8 encoded, key-sorted, indented JSON.
    """
    global id
    global convert_list
    global type_list
    # Every export starts from a clean slate so record ids restart at 1.
    id = 1
    convert_list = []
    type_list = []

    for entry in fileList:
        readDocx(entry["path"], entry["type"])

    # Question bank output location.
    jsonPath = os.path.join(curPath, "topic", name + ".txt")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(jsonPath), exist_ok=True)

    with io.open(jsonPath, 'w', encoding='utf-8') as f:
        f.write(json.dumps(convert_list, ensure_ascii=False, indent=4, sort_keys=True))

def main():
    """Convert the Portuguese, Spanish and English question banks, in that order.

    Each language maps an output name to (document path, category) pairs; the
    pairs are turned into the ``{"path": ..., "type": ...}`` dicts that
    ``writeDocx`` expects.
    """
    banks = (
        ("pt_br_topic", (
            ("pt_br_topic\\地理(葡）Respueda G .es.pt", "World"),
            ("pt_br_topic\\科学与技术（葡）", "Technology"),
            ("pt_br_topic\\历史（葡)Resupeda H.es.pt", "History"),
            ("pt_br_topic\\艺术和文学（葡）Respueda A&L.es.pt", "ArtAndLiterature"),
            ("pt_br_topic\\娱乐（葡）Respueda E.es.pt", "Fashion"),
            ("pt_br_topic\\运动（葡)Respueda  D.es.pt", "Sports"),
        )),
        ("es_es_topic", (
            ("es_es_topic\\地理(西）Respueda G ", "World"),
            ("es_es_topic\\科学与技术(西）Respueda C&T", "Technology"),
            ("es_es_topic\\历史（西)Resupeda H", "History"),
            ("es_es_topic\\艺术和文学（西）Respueda A&L", "ArtAndLiterature"),
            ("es_es_topic\\娱乐（西）Respueda E", "Fashion"),
            ("es_es_topic\\运动（西)Respueda  D", "Sports"),
        )),
        ("en_us_topic", (
            ("en_us_topic\\地理(英）Respueda G .es.en", "World"),
            ("en_us_topic\\科学与技术（英）", "Technology"),
            ("en_us_topic\\历史（英)Resupeda H.es.en", "History"),
            ("en_us_topic\\艺术和文学（英）Respueda A&L.es.en", "ArtAndLiterature"),
            ("en_us_topic\\娱乐（英）Respueda E.es.en", "Fashion"),
            ("en_us_topic\\运动（英)Respueda  D.es.en", "Sports"),
        )),
    )

    for output_name, documents in banks:
        file_list = [{"path": path, "type": kind} for path, kind in documents]
        writeDocx(file_list, output_name)
    
# Script entry point: runs the whole conversion as soon as this file executes.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module also runs main().
main()
python读取word文档识别字段颜色，解析字段

python读取word文档识别字段颜色，解析字段

python版本3.7.3，读取的文档格式为.docx

文中带有简单注释

相关阅读更多精彩内容

友情链接更多精彩内容