Python：用Word2Vec 和 sklearn 对IMDB评论进行分类训练

之前一直做的是目标跟踪上的东西，这几天在看这本书又看到NLP，两者均作为对数据序列的处理，应该是有共通点的，于是就简单摸索了一下。

在NLP建立由词到向量的映射最简单的方法是bag of words，粗暴直接，没准还效果拔群。

但是bag of words 没法表达出词与词之间联系与相似程度，功能上还是有些粗糙，所以就考虑用Word2Vec将词映射到向量空间，再进行分类训练。

这次的工作主要就是一些书上教程和word2vec的结合

需要用到的module

sklearn
nltk
gensim

不是什么偏门模块，直接anaconda里面install就行，conda没有就pip

数据来源

爬网页什么的就算了，我也搞不了那么大的，直接找现成的。（我是拒绝承认我是因为知道了这个数据才做这项工作的）。
这个数据集里面把IMDB评分在5以上的都视作positive sample，5以下的视作 negative sample

数据预处理

借用了 nltk 的 stopwords 集，也就是 i、you、is 这类没什么信息量、却到处出现且频率极高的词，用它来把这些词从训练集中清除。
pyprind看个进度
还有个对符号表情的提取，比如 :-) 。但是这一套东西，不管是 stopwords 还是表情符号，都是基于英语环境的，对于中文还是不行，或者是已有类似的成果只是我孤陋寡闻。（我就对这种 (～￣▽￣)～还有这种 (눈‸눈) 搞出通用的识别方法的难度表示关切）

把原始的txt整理成为csv

import pyprind
import pandas as pd
import os
from nltk.corpus import stopwords
import re
import numpy as np


# tokenizer() tests every token against this collection; a set makes each
# membership test O(1) instead of a linear scan over the stop-word list.
stop = set(stopwords.words('english'))


def tokenizer(text):
    """Turn a raw review into a list of tokens.

    HTML tags are stripped, emoticons are collected, the remaining text is
    lowercased and split on non-word characters, and every token found in
    the module-level ``stop`` collection is dropped.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The lowercased word tokens followed by the emoticons found in the
        text (with the ``-`` nose removed), excluding stop words.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Match on the original-case text: the pattern contains the upper-case
    # letters D and P, which can never match once the text is lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # The explicit ' ' separator keeps the first emoticon from being glued
    # onto the last word when the text does not end in punctuation.
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '
            + ' '.join(emoticons).replace('-', ''))
    return [w for w in text.split() if w not in stop]


# Root directory of the review files; it is expected to contain
# test/ and train/, each with pos/ and neg/ sub-directories.
basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
# Rows are collected in a plain list and turned into a DataFrame once.
# Appending to the DataFrame inside the loop copied the whole frame on every
# call, and DataFrame.append no longer exists in pandas >= 2.0.
rows = []
for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded shuffle below reproducible from one machine to another.
        for file_name in sorted(os.listdir(path)):
            with open(os.path.join(path, file_name), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([tokenizer(text=txt), labels[label_name]])
            pbar.update()
df = pd.DataFrame(rows, columns=['review', 'sentiment'])
# Shuffle with a fixed seed so the row order of the CSV is deterministic.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
# The token lists are written as their string representation, preceded by
# the (shuffled) index column.
df.to_csv('movie_data.csv')

生成基于此数据集的word2vec模型

import pyprind
import gensim.models
import re

# CSV of tokenised reviews to read the corpus from.
inpath = 'movie_data.csv'
# File the trained Word2Vec model is saved to.
outpath = 'wordVectTrainResult'
# csvStream ticks this bar once per row, and the corpus is streamed twice
# (once by build_vocab, once by train) -- presumably 2 x 50000 rows.
pbar = pyprind.ProgBar(100000)
class csvStream(object):
    """Re-iterable stream of tokenised reviews read from a CSV file.

    Every call to ``__iter__`` re-opens the file, so one instance can be
    iterated several times. Rows are expected in the layout
    ``<index>,"['tok', 'tok', ...]",<label>`` with a single-character label.

    Args:
        path: Path of the CSV file to stream.
    """

    # Quotes, brackets, digits and backspace characters are removed so that
    # only the ', '-separated tokens of the stringified list remain.
    _NOISE = re.compile(r'[\'"\[\]\d\b]')

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        """Yield one list of token strings per CSV row."""
        # Explicit utf-8 instead of the platform default encoding.
        with open(self.path, 'r', encoding='utf-8') as infile:
            # Skip the header. The default avoids a StopIteration escaping
            # from the generator (a RuntimeError) when the file is empty.
            next(infile, None)
            for line in infile:
                # line[4:-3] cuts the trailing ",<label>\n" and part of the
                # index; what is left of the index is digits and commas,
                # which the substitution and the lstrip below remove.
                text = self._NOISE.sub('', line[4:-3])
                # lstrip also copes with a row that ends up empty, where
                # indexing text[0] would raise IndexError.
                text = text.lstrip(', ')
                pbar.update()
                # Removing digits can leave empty tokens behind; drop them.
                yield [tok for tok in text.split(', ') if tok]


# csvStream re-opens the file on every iteration, which is needed here
# because the corpus is consumed once by build_vocab and again by train.
lineIterator = csvStream(inpath)
# No corpus is passed to the constructor; vocabulary building and training
# are triggered explicitly below.
model = gensim.models.Word2Vec()
model.build_vocab(lineIterator)
print('vocabulary building finished, start training...')
# A single pass over the corpus (epochs=1).
model.train(lineIterator,total_examples=model.corpus_count,epochs=1)
model.save(outpath)

模型被保存到了当前目录的wordVectTrainResult文件。想用的时候再load就行。

分类器训练

可怜我的小笔记本，跑grid来选取最优的参数肯定是不行了，所以就采用了SGD miniBatch的训练方式。分类器在sklearn里面有现成的，直接拿来用就可以。
当前面临的一个最重要的问题是怎么把基于词的word2vec映射数据对应到训练数据的以句（段）为单位的映射数据。各个样本的长度不一，所以没法通过堆叠的方式来转化成训练数据。
最粗暴的方法是以每条样本句子（段落）中各个单词的词向量的平均值，作为整个句子在词向量空间中的向量。查了一下网上大佬们的说法，这里的这位大佬提出在word2vec词库的基础上用bag of words的方法。。。我默默瞅了一眼我的小笔记本。。还有大佬直接贴了一篇论文From Word Embeddings To Document Distances（ICML-15）。。算了，看看最简单粗暴的能出个什么结果吧。


# load the trained word2vec model
import gensim.models

# File name under which the Word2Vec training step saved its model.
inpath = 'wordVectTrainResult'
model = gensim.models.Word2Vec.load(inpath)

# start with the IMDB data
import re
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier
import pyprind
import numpy as np
import matplotlib.pyplot as plt

# A set gives O(1) stop-word membership tests in tokenizer().
stop = set(stopwords.words('english'))
# Number of reviews per mini-batch. BatchNum * BatchSize must be smaller
# than 50000 so that some reviews are left over for testing.
BatchSize = 1000

def tokenizer(text):
    """Turn a raw review into a list of tokens.

    HTML tags are stripped, emoticons are collected, the remaining text is
    lowercased and split on non-word characters, and every token found in
    the module-level ``stop`` collection is dropped.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The lowercased word tokens followed by the emoticons found in the
        text (with the ``-`` nose removed), excluding stop words.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Match on the original-case text: the pattern contains the upper-case
    # letters D and P, which can never match once the text is lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # The explicit ' ' separator keeps the first emoticon from being glued
    # onto the last word when the text does not end in punctuation.
    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '
            + ' '.join(emoticons).replace('-', ''))
    return [w for w in text.split() if w not in stop]


def stream_docs(path):
    """Yield ``(tokens, label)`` pairs, one per row of the review CSV.

    Rows are expected in the layout
    ``<index>,"['tok', 'tok', ...]",<label>`` where the label is the last
    character of the row.

    Args:
        path: Path of the CSV file to read.

    Yields:
        A tuple ``(tokens, label)``: a list of token strings and the
        integer label of the row.
    """
    # Quotes, brackets, digits and backspace characters are removed so that
    # only the ', '-separated tokens of the stringified list remain.
    noise = re.compile(r'[\'"\[\]\d\b]')
    # Explicit utf-8 instead of the platform default encoding.
    with open(path, 'r', encoding='utf-8') as infile:
        next(infile, None)  # skip header; tolerate an empty file
        for line in infile:
            # Strip the line ending first so a final row without a trailing
            # newline is parsed like every other row.
            line = line.rstrip('\r\n')
            if not line:
                continue
            # line[4:-2] cuts the trailing ",<label>" and part of the index;
            # the rest of the index is removed by the steps below.
            text, label = line[4:-2], int(line[-1])
            # Strip leading commas AND spaces: a leading space would stay
            # attached to the first token, and indexing text[0] on an empty
            # row would raise IndexError.
            text = noise.sub('', text).lstrip(', ')
            # Removing digits can leave empty tokens behind; drop them.
            yield [tok for tok in text.split(', ') if tok], label


def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: Iterator yielding ``(text, label)`` pairs.
        size: Maximum number of documents to take.

    Returns:
        ``(docs, y)``, two parallel lists. When the stream runs out part-way
        the documents collected so far are returned rather than discarded.
        ``(None, None)`` is returned only when the stream was already
        exhausted and nothing could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            break
        docs.append(text)
        y.append(label)
    # Keep the "nothing left" signal callers test with ``if not docs``.
    if size > 0 and not docs:
        return None, None
    return docs, y


def _mean_vector(tokens, vectors, dim):
    """Average the embeddings of the tokens that ``vectors`` knows.

    Unknown tokens are skipped one by one instead of aborting the whole
    document at the first miss. A document with no known token maps to the
    zero vector rather than dividing by zero.
    """
    total = np.zeros(dim)
    known = 0
    for word in tokens:
        try:
            total += vectors[word]
        except KeyError:
            continue
        known += 1
    return total / known if known else total


def _featurize(docs, vectors, dim):
    """Map every tokenised document to its mean word vector."""
    return [_mean_vector(doc, vectors, dim) for doc in docs]


# Newer gensim releases expose the word vectors on ``model.wv``; fall back
# to indexing the model itself when that attribute is missing.
word_vectors = getattr(model, 'wv', model)
# Use the dimensionality the model was trained with, not a hard-coded 100.
vector_size = model.vector_size

ACC = []

classes = np.array([0, 1])
x = range(25, 46)
pbar = pyprind.ProgBar(len(x))

for BatchNum in x:
    # A fresh classifier per split: reusing one instance across the sweep
    # would let later splits benefit from all earlier training passes, so
    # the curve would no longer isolate the train/test ratio.
    # partial_fit performs a single pass per call, so no iteration count is
    # passed. NOTE(review): recent scikit-learn releases spell this loss
    # 'log_loss' -- adjust to the installed version.
    clf = SGDClassifier(loss='log', random_state=1)
    doc_stream = stream_docs(path='movie_data.csv')
    for _ in range(BatchNum):
        X_raw, y_train = get_minibatch(doc_stream, size=BatchSize)
        if not X_raw:
            break
        X_train = _featurize(X_raw, word_vectors, vector_size)
        clf.partial_fit(X_train, y_train, classes=classes)

    # Everything not used for training becomes the test set.
    X_raw_test, y_test = get_minibatch(doc_stream, size=(50000-BatchNum*BatchSize))
    if not X_raw_test:
        raise RuntimeError(
            'movie_data.csv ran out of rows before the test split '
            'could be read')
    X_test = _featurize(X_raw_test, word_vectors, vector_size)
    ACC.append(clf.score(X_test, y_test))
    pbar.update()

plt.plot(x, ACC)
plt.xlabel('BatchNum')
plt.ylabel('Accuracy')
plt.grid()
plt.show()

因为在前几次测试的时候发现训练样本和测试样本的比值对最后测试准确度影响很大。所以就做了个50%-50%到90%-10%的遍历，看看比值对最终结果的影响。

这里写图片描述

(⊙ω⊙)！
好像有那么点意思

虽然抖得比较厉害，但总体趋势向上，最后差不多到 75%。

个人感觉，因为词向量（词典）是用全部 50000 个样本训练的，而在训练分类器的时候，训练样本太少基本等于瞎猜；训练样本数越接近 50000，准确率越高。

但是比起不用word2vec直接上 bag of words 的SGD方法（87%），差距还是挺明显的。产生差距的原因应该还是用了均值向量来表示一整个文档的特征。

如果结合word2vec和bag of words应该能够有更好的结果，有空再补。