# 使用 Python 自然语言处理包 Gensim 调用 Word2Vec 进行词向量转换
import gensim
import logging
import os
# Data loading.
# A small sample can simply be held in a list of token lists.
logging.basicConfig(
    format="%(asctime)s:%(levelname)s:%(message)s",
    level=logging.INFO,
)
sentences = [
    ["first", "sentence"],
    ["second", "sentence"],
]
model = gensim.models.Word2Vec(sentences, min_count=1)
# In practice, read the corpus through an iterator rather than a list.
class MySenctence(object):
    """Iterable over the tokenised lines of every file in a directory.

    ``__iter__`` is a generator function, so each ``iter()`` call starts a
    fresh pass over the directory and an instance can be iterated repeatedly.
    """

    def __init__(self, dirname):
        """Remember the corpus location.

        Args:
            dirname: Path of the directory whose files make up the corpus.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of tokens per line, file by file.

        Yields:
            list: The line split on whitespace (``str.split()`` with no
            arguments); a blank line yields an empty list.
        """
        # os.listdir returns entries in arbitrary order; sort them so that
        # every pass over the corpus sees the files in the same sequence.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text files; skip them.
                continue
            # The context manager closes the file even if the consumer stops
            # iterating early or an error occurs mid-file.
            with open(path, "r") as handle:
                for line in handle:
                    # Tokenise: whitespace splitting is enough for English.
                    yield line.split()
# Training.
# NOTE(review): gensim >= 4.0 renamed the `size` keyword to `vector_size`;
# confirm which gensim version this script is expected to run against.
model = gensim.models.Word2Vec(
    sentences,
    size=100,     # dimensionality of the word vectors
    window=5,     # context window: how many words before/after are considered
    min_count=1,  # filter threshold: minimum number of occurrences of a word
    workers=4,    # degree of parallelism
)
# Persist the trained model.
model.save("file_path/to/model")
# Using the model.
# Queries go through `model.wv` (the model's KeyedVectors): the same-named
# shortcuts on the model object itself are deprecated and were removed in
# gensim 4.0.
# NOTE(review): every word queried below has to be in the trained vocabulary,
# otherwise the lookup is expected to raise KeyError; confirm that the corpus
# used for training actually contains these words.

# Analogy query: the word most similar to king - man + woman.
model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=1)
# The word in the collection that least belongs with the others.
model.wv.doesnt_match("breakfast cereal dinner lunch".split())
# Similarity between two words.
model.wv.similarity("woman", "man")
# Look up the vector of a single word.
print(model.wv["man"])
# Similarity between two collections of words.
list1 = ['我', '走', '我', '学校']
list2 = ['我', '去', '家']
list_sim = model.wv.n_similarity(list1, list2)
print(list_sim)