import html
import uuid

import docx
from docx import Document
from docx.text.paragraph import Paragraph
# document = Document('深圳证券交易所股票上市规则.docx').add_
def _heading_level(style_name):
    """Return N for a style named exactly ``Heading N`` (N >= 1), else None.

    Any other name, including custom styles that merely start with
    ``Heading``, is reported as a non-heading so the caller can render the
    paragraph as body text instead of failing inside ``int()``.
    """
    prefix, _, suffix = style_name.partition(' ')
    if prefix != 'Heading' or not suffix.isdecimal():
        return None
    level = int(suffix)
    return level if level >= 1 else None


def catalogue_get(doc):
    """Render a Word document as one HTML string with numbered heading ids.

    Args:
        doc: Either something ``Document()`` can open (the original usage
            passes a ``.docx`` path), or an already-loaded object that
            exposes a ``paragraphs`` iterable whose items have ``.text``
            and ``.style.name``.

    Returns:
        The concatenated HTML as a ``str``:

        * the first paragraph is emitted as a centered ``<h1>`` title;
        * ``Heading 1``..``Heading 3`` paragraphs become ``<h3>``..``<h5>``
          elements whose ``id`` is an outline number such as ``1.``,
          ``1.2.`` or ``1.2.1.``;
        * every other paragraph after the first becomes ``<p>...</p>``.

        Paragraph text is HTML-escaped (``&``, ``<``, ``>``) before being
        embedded. An input with no paragraphs yields ``''``.
    """
    # Accept a pre-loaded document so callers need not re-open the file.
    document = doc if hasattr(doc, 'paragraphs') else Document(doc)

    max_numbered_level = 3  # only levels 1-3 receive an outline number
    counters = [1]          # current outline position, one entry per depth
    last_level = 0          # 0 means no numbered heading has been seen yet
    parts = []              # HTML fragments, joined once at the end

    for index, paragraph in enumerate(document.paragraphs):
        # Escape once; the text is only ever placed in element content.
        text = html.escape(paragraph.text, quote=False)
        title = f'<div><center><h1>{text}</h1></center></div> '
        level = _heading_level(paragraph.style.name)

        if level is None:
            # Body text: the very first paragraph doubles as the title.
            parts.append(title if index == 0 else f'<p>{text}</p>')
            continue

        # NOTE(review): a heading in first position is emitted as the title
        # AND as a numbered heading below; kept as in the original code.
        if index == 0:
            parts.append(title)

        # NOTE(review): headings deeper than level 3 produce no output at
        # all (their text is dropped); kept as in the original code.
        if level > max_numbered_level:
            continue

        if level == 1 and last_level == 0:
            pass  # very first chapter keeps the initial "1."
        elif level > last_level:
            counters.append(1)  # descend: open a new sub-level
        elif level == last_level:
            counters[-1] += 1   # sibling: bump the deepest counter
        else:
            # Ascend: bump the counter at this level, discard deeper ones.
            counters[level - 1] += 1
            del counters[level:]
        last_level = level

        heading = '.'.join(map(str, counters)) + '.'
        size = level + 2  # Heading 1 -> <h3>, since <h1> is the title
        parts.append(f"<h{size} id='{heading}'>{text}</h{size}>")

    return ''.join(parts)
# Script driver: convert one hard-coded document and print the result.
# NOTE(review): these statements run at import time, so importing this module
# opens the file below and prints; consider an `if __name__ == "__main__":`
# guard if the module is ever imported elsewhere.
# Alternative input left by the author:
# path = 'D://Users/Desktop/new/深圳证券交易所创业板股票上市规则(2020年修订).docx'
path = '深圳证券交易所股票上市规则.docx'
# `data` receives whatever catalogue_get returns (an HTML string).
data = catalogue_get(path)
print(data)
# --- Residual web-page text (not part of the program) ---
# 目录树
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 最近看到vscode tree生成目录树结构插件挺好玩的,自己琢磨实现了一个简单版本,支持文件,文件夹过滤 最终效...
# - 最近有个 task,是把其它研发组的 React JS 代码整合到自家项目里。先从度娘效率云上面 clone 了下来~~