电脑上很多有道词典的单词想转移到墨墨背单词上记忆,近日发现有前人已经做了相关的工作
来源于有道云笔记转墨墨单词
感谢原作者的贡献。原文的版本不能提取词组,我在此基础上做了小小的修改,希望能帮助到有需要的人。
# -*- coding: utf-8 -*-
"""
Created on Apr 28 2016
Extracting vocabulary from Youdao dictionary
The vocabulary text file should be encoded as UTF-8
<INPUT>
file_in: the exported vocabulary from Youdao
</INPUT>
<OUTPUT>
file_out: the file to save the English words. Default file name is
new_words_'time'.txt ('time' is the local date)
</OUTPUT>
@author: sinit PowerDi
"""
import codecs,time
# Path of the exported vocabulary list; it is read as UTF-8 text.
file_in = r'voc.txt'
# The output name carries the local date, e.g. new_words_2016-04-28.txt.
outname = (
    'new_words' + '_'
    + time.strftime("%Y-%m-%d", time.localtime())
    + ".txt"
)
file_out = outname
# Read all lines up front. The context manager closes the file even when
# reading or decoding fails part-way through.
with codecs.open(file_in, 'r', 'utf-8') as fs:
    vocabulary = fs.readlines()
# Accumulates the extracted English words/phrases.
word = []
def is_chinese(uchar):
    """Tell whether a character is a basic CJK Unified Ideograph.

    The accepted range is U+4E00 through U+9FA5, both ends inclusive.

    Args:
        uchar: A single unicode character.

    Returns:
        True if the character lies inside the range, otherwise False.
    """
    # The bounds have to be real unicode escapes (backslash-u). Spelling them
    # with a forward slash compares against the literal text "/u4e00", which
    # makes every actual Chinese character fall outside the range.
    return u'\u4e00' <= uchar <= u'\u9fa5'
def is_zh(c):
    """Tell whether a character belongs to one of the CJK-related ranges.

    Args:
        c: A single character; it is passed to ord(), so it must have
            length one.

    Returns:
        True if the code point of ``c`` falls inside any of the ranges
        listed below, otherwise False.
    """
    # ord() is the counterpart of chr(): it yields the ASCII / Unicode
    # code point of the character.
    code_point = ord(c)
    # Inclusive (first, last) code point pairs.
    cjk_ranges = (
        (0x2e80, 0x33ff),    # Punct & Radicals
        (0x3400, 0x4dbf),    # CJK Unified Ideographs Extension A
        (0x4e00, 0x9fbb),    # CJK Unified Ideographs
        (0xf900, 0xfad9),    # CJK Compatibility Ideographs
        (0xff00, 0xffef),    # Fullwidth Latin Characters
        (0x20000, 0x2a6d6),  # CJK Unified Ideographs Extension B
        (0x2f800, 0x2fa1d),  # CJK Compatibility Supplement
    )
    return any(first <= code_point <= last for first, last in cjk_ranges)
# Collect the headword (or multi-word phrase) of every numbered entry line.
for raw_line in vocabulary:
    tokens = raw_line.split()
    # An entry line begins with an index token whose last character is a
    # separator (e.g. "12,"); everything before that separator must be
    # digits. Blank lines, non-entry lines and index-only lines (no word
    # after the index) are skipped instead of raising IndexError.
    if len(tokens) < 2 or not tokens[0][:-1].isdigit():
        continue
    # Keep the first word plus any following tokens that do not look like a
    # bracketed phonetic transcription, so multi-word phrases are preserved.
    phrase_parts = [tokens[1]]
    phrase_parts.extend(
        token for token in tokens[2:]
        if token[0] != u"[" and token[-1] != u"]"
    )
    newword = u" ".join(phrase_parts)
    # Entries whose text starts with a CJK character are not English words.
    if not is_zh(newword[0]):
        word.append(newword)
# Write one extracted word/phrase per line. The file is opened through codecs
# with an explicit UTF-8 encoding so that non-ASCII characters in a word do
# not depend on the platform default encoding, and the context manager closes
# the file even if a write fails. Lines are terminated with a plain "\n".
with codecs.open(file_out, 'w', 'utf-8') as fs:
    for line in word:
        fs.write(line)
        fs.write(u'\n')
print('Assignment Done!')