# 1.1 Implement distinct_words
def distinct_words(corpus):
    """Return the sorted vocabulary of a corpus and its size.

    Params:
        corpus: list of documents, each a list of word strings.
    Returns:
        corpus_words: sorted list of the distinct words in the corpus.
        num_corpus_words: number of distinct words.
    """
    # Build the vocabulary with a set comprehension; sorted() accepts any
    # iterable, so wrapping the set in list() first was redundant.
    corpus_words = sorted({word for doc in corpus for word in doc})
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words
# 1.2 Implement compute_co_occurrence_matrix
def compute_co_occurrence_matrix(corpus, window_size=4):
    """Compute a symmetric co-occurrence matrix for a corpus.

    Params:
        corpus: list of documents, each a list of word strings.
        window_size: number of context words counted on each side of the
            center word (default 4).
    Returns:
        M: (num_words, num_words) numpy array; M[i, j] counts how often
            word j appears within `window_size` of word i.
        word2Ind: dict mapping each word to its row/column index in M.
    """
    words, num_words = distinct_words(corpus)
    word2Ind = {word: idx for idx, word in enumerate(words)}
    M = np.zeros((num_words, num_words))
    for doc in corpus:
        doc_len = len(doc)
        for i, center in enumerate(doc):
            # Clamp the context window to the document boundaries;
            # `upper` is exclusive, hence the +1.
            low = max(0, i - window_size)
            upper = min(doc_len - 1, i + window_size) + 1
            idx1 = word2Ind[center]
            for j in range(low, upper):
                if i != j:
                    # Tuple indexing M[i, j] updates in place without the
                    # intermediate row view that M[i][j] would create.
                    M[idx1, word2Ind[doc[j]]] += 1
    return M, word2Ind
# 1.3 Implement reduce_to_k_dim
def reduce_to_k_dim(M, k=2):
    """Reduce matrix M to k columns using truncated SVD.

    Params:
        M: (num_words, num_words) co-occurrence matrix.
        k: target embedding dimensionality (default 2).
    Returns:
        M_reduced: (num_words, k) matrix of reduced word embeddings.
    """
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    # Fit the decomposition and project M in a single step.
    reducer = TruncatedSVD(n_components=k, n_iter=10)
    M_reduced = reducer.fit_transform(M)
    print("Done.")
    return M_reduced
# 1.4 Implement plot_embeddings
def plot_embeddings(M_reduced, word2Ind, words):
    """Scatter-plot the 2-D embedding of each requested word, labeled.

    Params:
        M_reduced: matrix of embeddings, one row per word (>= 2 columns).
        word2Ind: dict mapping words to row indices of M_reduced.
        words: iterable of words to plot.
    """
    for word in words:
        # Take the first two embedding dimensions as plot coordinates.
        embedding = M_reduced[word2Ind[word]]
        x, y = embedding[0], embedding[1]
        plt.scatter(x, y, marker='x', c='red')
        plt.text(x, y, word)
    plt.show()
# 1.5 Co-Occurrence Plot Analysis
def test_plot_embeddings():
    """Build, reduce, normalize, and plot co-occurrence embeddings
    for a fixed set of oil-industry words from the Reuters corpus."""
    reuters_corpus = read_corpus()
    M_co_occurrence, word2Ind_co_occurrence = \
        compute_co_occurrence_matrix(reuters_corpus)
    M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)
    # Normalize each row to unit length. [:, np.newaxis] turns the 1-D
    # norm vector into a column so the division broadcasts row-wise.
    row_norms = np.linalg.norm(M_reduced_co_occurrence, axis=1)
    M_normalized = M_reduced_co_occurrence / row_norms[:, np.newaxis]
    words = ['barrels', 'bpd', 'ecuador',
             'energy', 'industry', 'kuwait', 'oil',
             'output', 'petroleum', 'venezuela']
    plot_embeddings(M_normalized, word2Ind_co_occurrence, words)