# 1.1 Implement distinct_words
def distinct_words(corpus):
    """Return the sorted vocabulary of a corpus and its size.

    Params:
        corpus: list of documents, each a list of word strings.
    Returns:
        corpus_words: sorted list of the distinct words in the corpus.
        num_corpus_words: number of distinct words.
    """
    # Build the vocabulary with a set comprehension; sorted() accepts any
    # iterable, so wrapping the set in list() first was redundant.
    corpus_words = sorted({word for doc in corpus for word in doc})
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words
# 1.2 Implement compute_co_occurrence_matrix
def compute_co_occurrence_matrix(corpus, window_size=4):
    """Compute a symmetric co-occurrence matrix for a corpus.

    Params:
        corpus: list of documents, each a list of word strings.
        window_size: number of context words counted on each side of the
            center word (default 4).
    Returns:
        M: (num_words, num_words) numpy array; M[i, j] counts how often
            word j appears within `window_size` of word i.
        word2Ind: dict mapping each word to its row/column index in M.
    """
    words, num_words = distinct_words(corpus)
    word2Ind = {word: idx for idx, word in enumerate(words)}
    M = np.zeros((num_words, num_words))
    for doc in corpus:
        doc_len = len(doc)
        for i, center in enumerate(doc):
            # Clamp the context window to the document boundaries;
            # `upper` is exclusive, hence the +1.
            low = max(0, i - window_size)
            upper = min(doc_len - 1, i + window_size) + 1
            idx1 = word2Ind[center]
            for j in range(low, upper):
                if i != j:
                    # Tuple indexing M[i, j] updates in place without the
                    # intermediate row view that M[i][j] would create.
                    M[idx1, word2Ind[doc[j]]] += 1
    return M, word2Ind
# 1.3 Implement reduce_to_k_dim
def reduce_to_k_dim(M, k=2):
    """Reduce matrix M to k columns using truncated SVD.

    Params:
        M: (num_words, num_words) co-occurrence matrix.
        k: target embedding dimensionality (default 2).
    Returns:
        M_reduced: (num_words, k) matrix of reduced word embeddings.
    """
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    # Fit the decomposition and project M in a single step.
    reducer = TruncatedSVD(n_components=k, n_iter=10)
    M_reduced = reducer.fit_transform(M)
    print("Done.")
    return M_reduced
# 1.4 Implement plot_embeddings
def plot_embeddings(M_reduced, word2Ind, words):
    """Scatter-plot the 2-D embedding of each requested word, labeled.

    Params:
        M_reduced: matrix of embeddings, one row per word (>= 2 columns).
        word2Ind: dict mapping words to row indices of M_reduced.
        words: iterable of words to plot.
    """
    for word in words:
        # Take the first two embedding dimensions as plot coordinates.
        embedding = M_reduced[word2Ind[word]]
        x, y = embedding[0], embedding[1]
        plt.scatter(x, y, marker='x', c='red')
        plt.text(x, y, word)
    plt.show()
# 1.5 Co-Occurrence Plot Analysis
def test_plot_embeddings():
    """Build, reduce, normalize, and plot co-occurrence embeddings
    for a fixed set of oil-industry words from the Reuters corpus."""
    reuters_corpus = read_corpus()
    M_co_occurrence, word2Ind_co_occurrence = \
        compute_co_occurrence_matrix(reuters_corpus)
    M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)
    # Normalize each row to unit length. [:, np.newaxis] turns the 1-D
    # norm vector into a column so the division broadcasts row-wise.
    row_norms = np.linalg.norm(M_reduced_co_occurrence, axis=1)
    M_normalized = M_reduced_co_occurrence / row_norms[:, np.newaxis]
    words = ['barrels', 'bpd', 'ecuador',
             'energy', 'industry', 'kuwait', 'oil',
             'output', 'petroleum', 'venezuela']
    plot_embeddings(M_normalized, word2Ind_co_occurrence, words)