Notes on the word-vector training process, kept for future reference (Ubuntu 16, Python 2.7).
Covered: extracting rar archives in Python, splitting large files, HDF5 file operations, text preprocessing (encoding detection, word segmentation), multiprocessing, and gensim usage.
1. Extracting files with a given extension from rar archives
Ubuntu needs the unrar tool installed first: sudo apt-get install unrar
import os, re, rarfile

def unrar_extract_txt(rar_path, save_path, reg='\.txt|\.TXT'):
    """
    uncompress rar files and extract the txt files inside
    rar_path  : directory where the rar files live
    save_path : directory to save the extracted files
    reg       : pattern for re.compile
    """
    regex = re.compile(reg)
    for fname in os.listdir(rar_path):
        rf = rarfile.RarFile(rar_path + fname)
        for f in rf.infolist():
            if regex.search(f.filename):
                rf.extract(f, save_path)
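A minimal usage sketch; both directory paths below are placeholders, not from the original notes:
# usage sketch -- adjust the placeholder paths to your layout
rar_dir = './rars/'
txt_dir = './raw_txt/'
if not os.path.exists(txt_dir):
    os.makedirs(txt_dir)
unrar_extract_txt(rar_dir, txt_dir)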
2. Splitting large files
import os

def slice_preprocessing(file, save_path, chars=1e7):
    """
    slice one big utf-8 file into small files
    file      : path of the big file
    save_path : directory to save the small files
    chars     : number of chars each small file contains
    """
    f = open(file, 'r')
    data = f.read()
    f.close()
    data = data.decode('utf8', 'ignore')
    data_l = len(data)
    iters = int(data_l / chars) + 1
    for n in xrange(iters):
        start = int(n * chars)
        end = int((1 + n) * chars)
        if end > data_l:
            end = data_l
        tem = data[start:end]
        tem = tem.encode('utf8')
        small_filename = save_path + os.path.split(file)[1] + '_' + str(start) + '_' + str(end)
        f = open(small_filename, 'w')
        f.write(tem)
        f.close()
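A usage sketch that slices every extracted file; ./raw_txt/ and ./sliced/ are placeholder directories, and save_path keeps its trailing slash because the function concatenates it directly:
# usage sketch -- placeholder paths
src_dir = './raw_txt/'
dst_dir = './sliced/'
if not os.path.exists(dst_dir):
    os.makedirs(dst_dir)
for fname in os.listdir(src_dir):
    if fname.endswith('.txt') or fname.endswith('.TXT'):
        slice_preprocessing(src_dir + fname, dst_dir, chars=1e7)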
3. Walking a directory tree and renaming the files in it
import os, re

def prepare_files(search_path):
    """ search_path : directory where the files live """
    regex = re.compile('\.txt$|\.TXT$')
    file_count = 0
    for root, dirs, files in os.walk(search_path, topdown=True):
        for name in files:
            if regex.search(name):
                try:  # shutil.copy() os.remove()
                    new_name = '%d.txt' % (file_count)
                    os.rename(os.path.join(root, name), os.path.join(root, new_name))
                    file_count += 1
                except:
                    continue
    print('total renamed files : %d' % (file_count))
    return file_count
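A usage sketch, again with the placeholder directory from above:
# rename everything under ./sliced/ to 0.txt, 1.txt, ...
file_count = prepare_files('./sliced/')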
4. Encoding detection and text preprocessing
import os, re, random
import jieba, chardet
import numpy as np

def prepare_data(file_path):
    """
    detect the encoding, decode to unicode, remove useless chars,
    cut with jieba and slice the result into a numpy array
    file_path : path of the txt file
    return :
        detect/decode/clean/slice success : (file name, ndarray of shape (N, 10000))
                                            e.g. for 23222.txt -> ('23222', ndarray(37, 10000))
        any step fails                    : (-1, file name)
    each element of the numpy array is one word
    """
    with open(file_path, 'r') as txt_f:
        context = txt_f.read()
    con_len = len(context)
    # if detect_coding['confidence'] < 0.95, keep trying (max 5 times)
    for n in xrange(5):
        try:  # if the file is shorter than 1000 bytes, randint raises and the file is ignored
            rand_num = random.randint(0, con_len - 1000)
            detect_coding = chardet.detect(context[rand_num:rand_num + 1000])
        except Exception as e:
            return (-1, os.path.split(file_path)[1][:-4])
        if detect_coding['confidence'] < 0.95:
            if n == 4:
                return (-1, os.path.split(file_path)[1][:-4])
            else:
                continue
        else:
            if detect_coding['encoding'] == 'GB2312':
                detect_coding['encoding'] = 'GBK'
            # ignore illegal chars
            context = context.decode(detect_coding['encoding'], 'ignore')
            # remove useless chars
            context = context.replace(' ', '')
            context = context.replace('\n', '')
            context = context.replace('\r', '')
            context = context.replace(u'\u3000', '')  # u'\u3000' (utf-8 bytes '\xe3\x80\x80') is the full-width space, which looks like a space but is slightly wider
            context = re.sub('\d+', '', context)
            context = re.sub('[a-zA-Z]', '', context)
            context = re.sub(u'[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', '', context)
            # jieba cut, returns a list of utf8 words
            context_cut = jieba.cut(context)
            context_list = list(context_cut)
            context_list = ' '.join(context_list).encode('utf8').split()
            # prepare a numpy array of shape (N, 10000)
            context_len = len(context_list)
            N = context_len // 10000
            if N == 0:  # if there are fewer than 10000 words, ignore the file
                return (-1, os.path.split(file_path)[1][:-4])
            context_list = context_list[:N * 10000]
            context_array = np.array(context_list).reshape((N, 10000))
            return (os.path.split(file_path)[1][:-4], context_array)
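A quick single-file check of the function above; '0.txt' is a placeholder name produced by step 3:
# usage sketch
name, result = prepare_data('./sliced/0.txt')
if name == -1:
    print('rejected file %s (encoding or length check failed)' % result)
else:
    print('file %s -> array shape %s' % (name, str(result.shape)))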
5. Multiprocessing for the data preprocessing
Processing everything in a single multiprocessing pass runs out of memory, so a generator feeds the files to the preprocessing function batch by batch and the pool is run once per batch. After preprocessing, the data is best stored in an HDF5 file; the total size here comes to nearly 400 GB.
The prepare_data function passed to pool.map(prepare_data, files) (the per-file function from step 4) must be a top-level function that takes exactly one argument (a lambda does not work either, I tried). A sketch of the batch generator follows this paragraph.
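A minimal sketch of that generator, assuming the files were renamed 0.txt … (file_count-1).txt by step 3 and live in a single directory; the name files_path_feeder matches the parameter below, everything else is illustrative:
import os

def files_path_feeder(search_path, file_count, batch_size=5000):
    """yield lists of at most batch_size file paths"""
    batch = []
    for n in xrange(file_count):
        batch.append(os.path.join(search_path, '%d.txt' % n))
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # the last, possibly smaller, batch
        yield batch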
import time, logging, h5py
from multiprocessing import Pool

def multi_processing_data(files_path_feeder, save_path, file_count, batch_size=5000):
    """
    multi-processing: execute prepare_data() on each file, save the output into hdf5
    files_path_feeder : a generator yielding batches of file paths
    save_path         : hdf5 file path, e.g. './output.hdf5'
    file_count        : total number of files to prepare
    batch_size        : how many files are prepared per batch
    """
    ck_num = int(file_count / batch_size)
    iter_times = 0
    rows = 0
    illegal_files = 0
    start_p = time.time()
    logging.info('start prepare_data')
    logging.info('-------------------------------------------------------------')
    for files in files_path_feeder:
        start_l = time.time()
        pool = Pool(45)
        output = pool.map(prepare_data, files)
        pool.close()
        pool.join()
        illegal_files += len([n for n in output if n[0] == -1])  # count illegal files
        output = [n for n in output if n[0] != -1]               # drop illegal files
        for n in output:                                         # count rows of corpus
            rows += n[1].shape[0]
        # write into hdf5
        output = dict(output)
        f = h5py.File(save_path, 'a')
        for key, value in output.iteritems():
            f.create_dataset(key, data=value)
        f.close()
        del output
        # monitor progress
        percentage = float(iter_times + 1) / (ck_num + 1)
        done = int(100 * percentage)
        undone = 100 - done
        iter_times += 1
        logging.info('iteration %d, multi-processing time: %0.2f s' % (iter_times, time.time() - start_l))
        logging.info(''.join(['#'] * done + ['.'] * undone) + (' %0.2f' % (percentage * 100)) + '%')
        logging.info('-------------------------------------------------------------')
    logging.info('total files %d , illegal %d, effective %d (%0.2f) ' % (
        file_count, illegal_files, file_count - illegal_files,
        float(file_count - illegal_files) / file_count))
    logging.info('total rows %d , each row contains 10000 words (utf-8)' % (rows))
    logging.info('done prepare_data, processing time: %0.2f s' % (time.time() - start_p))
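Putting steps 3 to 5 together; an end-to-end sketch using the generator sketched above, with placeholder paths and batch size:
# end-to-end sketch -- paths and batch size are placeholders
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    file_count = prepare_files('./sliced/')                     # step 3
    feeder = files_path_feeder('./sliced/', file_count, 5000)   # generator sketched above
    multi_processing_data(feeder, './corpus.hdf5', file_count, batch_size=5000)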
6. Notes on using HDF5 files:
# open / close
f = h5py.File(corpus_path, 'r')
f.close()
# modes: 'r'  read only
#        'a'  read/write, create the file if it does not exist
#        'r+' read/write, the file must already exist
#        'w'  create a new file (overwrite) and write
# write
f.create_dataset(key, data=value)
# read (each value is an h5py._hl.dataset.Dataset)
f.iterkeys()    # py2 only
f.itervalues()  # py2 only
f.iteritems()   # py2 only
f.items()       # (name, value) pairs
f.keys()
f.values()
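gensim's Word2Vec accepts any restartable iterable of token lists, so the corpus written in step 5 can be streamed straight out of HDF5. A sketch under the assumption that every dataset row holds 10000 utf-8 words, as produced above (the class name HDF5Sentences is illustrative):
import h5py

class HDF5Sentences(object):
    """restartable iterable over the corpus: one HDF5 row = one 'sentence'"""
    def __init__(self, corpus_path):
        self.corpus_path = corpus_path
    def __iter__(self):
        f = h5py.File(self.corpus_path, 'r')
        try:
            for key in f.keys():
                for row in f[key]:       # row is an ndarray of 10000 utf-8 words
                    yield list(row)
        finally:
            f.close()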
7. Using gensim and training the word vectors
A. Meaning of the model parameters: see word2vec.py in the gensim repo on GitHub and general word-vector training references
model = gensim.models.Word2Vec(sentences, size=300, alpha=0.025, window=5, min_count=10, max_vocab_size=None, sample=1e-3, seed=1, workers=45, min_alpha=0.0001, sg=0, hs=0, negative=20, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=1e4)
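A sketch of the actual training call, reusing the HDF5Sentences iterator sketched in step 6 and a placeholder output name; the parameters mirror the constructor line above:
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.INFO)

sentences = HDF5Sentences('./corpus.hdf5')    # iterator sketched in step 6
model = gensim.models.Word2Vec(sentences, size=300, window=5, min_count=10,
                               sample=1e-3, sg=0, negative=20, workers=45, iter=5)
model.save('word2vec.model')                  # placeholder file name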
B. Saving and loading models
from gensim.keyedvectors import KeyedVectors
# save
model.save(fname)  # only a model saved this way can be trained further!
model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)   # C binary format, about half the disk space of the method above
model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)  # C text format, takes a lot of disk space (about as much as the first method)
# load
model = gensim.models.Word2Vec.load(fname)
word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)
word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True)
# the most memory-friendly way to load
model = gensim.models.Word2Vec.load('model path')
word_vectors = model.wv
del model
word_vectors.init_sims(replace=True)
C. Continuing to train a model
# Update the model's neural weights from a sequence of sentences. The arguments below MUST be provided:
# 1. total_examples (rows) or total_words (rows * columns).
#    (If the corpus is the same one that was provided to build_vocab(), the count of examples
#    is available in the model's corpus_count property.)
# 2. epochs. In the common and recommended case, where train() is only called once,
#    the model's cached iter value should be supplied as the epochs value.
model.train(sentences, total_examples=model.corpus_count, epochs=1)
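A small sketch of resuming training on the same corpus, assuming the model was saved with model.save() as required above (file names are placeholders):
# load the full model and train it for another model.iter epochs on the same corpus
model = gensim.models.Word2Vec.load('word2vec.model')
sentences = HDF5Sentences('./corpus.hdf5')
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
model.save('word2vec.model')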
D. Using only the word-vector lookup functionality
word_vectors = model.wv  # in fact just the model's KeyedVectors instance
del model
word_vectors.init_sims(replace=True)
Running word_vectors.init_sims(replace=True) saves a lot of memory. When a KeyedVectors method such as most_similar is used, the normalized version of every vector in wv.syn0 is computed and cached in wv.syn0norm, so two copies of the word vectors sit in memory, one normalized and one not. The call above keeps only the normalized vectors and frees the rest. (Normalization here is per vector, i.e. row-wise; see the norm-related code in the gensim source.)
E. Using the model's query methods
# relations between words: 老板 + 女人 - 男人 = 老板娘
in : model.wv.most_similar(['老板', '女人'], ['男人'], topn=1)
out: 老板娘
# the word that does not fit in a sentence
in : model.doesnt_match('三人 造谣 萌娘 诋毁 四川 凉山 牺牲 民警 被 行政 拘留'.split())
out: 萌娘
# similarity between two words
in : model.similarity('呆萌', '可爱')
out: 0.672380870938
# words most similar to a given word
in : model.wv.most_similar(['二次元'], topn=10)
out: 动漫 亚文化 御宅族 网游 cosplay 宅舞 日漫 萌系 耽美 仙侠
F. Word-related data
Word vectors, vocabulary index, word frequency, and so on.
model.wv["limited"]              # word vector
model.wv.vocab["limited"].count  # word frequency
df = []
for word, vocab_obj in model.wv.vocab.items():
    # collect word, vocabulary index and count for every entry
    df.append([word, vocab_obj.index, vocab_obj.count])
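If pandas is available (an assumption, it is not used elsewhere in these notes), the list built above is easy to inspect sorted by frequency:
# sketch: show the 20 most frequent words
import pandas as pd
vocab_df = pd.DataFrame(df, columns=['word', 'index', 'count'])
print(vocab_df.sort_values('count', ascending=False).head(20))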