输入格式
训练输入是一个可迭代对象，其中每个元素是一条已分好词的句子（字符串列表），例如：
sentences = [['A1','A2'], ['B1','B2','B3'], ...]
模型训练
# Import word2vec
from gensim.models import word2vec
# Configure logging so gensim's training progress is printed
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Toy corpus: one raw string per sentence
raw_sentences = ["the quick brown fox jumps over the lazy dogs","yoyoyo you go home now to sleep"]
# Tokenize: Word2Vec expects an iterable of token lists, not raw strings.
# BUG FIX: the original iterated the undefined name `sentences` and called
# .encode('utf-8') first, which yields bytes (and bytes tokens) on Python 3;
# split the raw strings directly instead.
sentences = [s.split() for s in raw_sentences]
# Train the model; min_count=1 keeps every token (the corpus is tiny)
model = word2vec.Word2Vec(sentences, min_count=1)
# Cosine similarity between two in-vocabulary words.
# gensim >= 4 removed Word2Vec.similarity; it lives on the KeyedVectors (model.wv).
model.wv.similarity('dogs', 'you')
模型保存
# Full save: the file is not human-readable, but it keeps all training
# state, so the model can be loaded later and trained further.
model.save('/tmp/MyModel')
# word2vec text/binary interchange format: portable, but it drops the
# vocabulary tree and other training state, so no further training is
# possible from these files.
# FIX: gensim >= 1.0 moved save_word2vec_format from the model to the
# KeyedVectors object at model.wv.
model.wv.save_word2vec_format('/tmp/mymodel.txt', binary=False)
model.wv.save_word2vec_format('/tmp/mymodel.bin.gz', binary=True)
追加训练（在已保存的模型上继续训练）
# FIX: only `word2vec` was imported above (`gensim` itself is not bound),
# and the model was saved to '/tmp/MyModel' — load path made consistent.
model = word2vec.Word2Vec.load('/tmp/MyModel')
# Continue training on additional tokenized sentences (`more_sentences`
# must be an iterable of token lists, defined by the caller).
# New words must first be merged into the vocabulary:
model.build_vocab(more_sentences, update=True)
# FIX: gensim >= 4 requires total_examples and epochs to be passed explicitly.
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)
模型加载
# FIX: only `word2vec` was imported above (`gensim` itself is not bound),
# and the save path was '/tmp/MyModel' — loader and path made consistent.
model = word2vec.Word2Vec.load('/tmp/MyModel')