Sentiment Analysis on Movie Reviews

Classify the sentiment of sentences from the Rotten Tomatoes dataset

Competition Link

https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

Data Description

The dataset comprises tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId. Each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.

  • train.tsv contains the phrases and their associated sentiment labels. We have additionally provided a SentenceId so that you can track which phrases belong to a single sentence.
  • test.tsv contains just phrases. You must assign a sentiment label to each phrase.

The sentiment labels are:
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
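
Before building anything, it is worth a quick look at the raw files. The sketch below is my own sanity check (it assumes the files live under datasets/, the same paths the code below uses):

import pandas as pd

# Quick look at the raw files (paths are an assumption, matching the code below)
train_df = pd.read_csv('datasets/train.tsv', sep='\t')
test_df = pd.read_csv('datasets/test.tsv', sep='\t')

print(train_df.shape, test_df.shape)                      # number of phrases in each file
print(train_df['SentenceId'].nunique())                   # number of distinct sentences
print(train_df['Sentiment'].value_counts().sort_index())  # class counts for labels 0-4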

Code

# Import torchtext-related packages
from torchtext import data
from torchtext.vocab import Vectors
from tqdm import tqdm

import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

# Read the data with pandas
raw_train_data = pd.read_csv('datasets/train.tsv', sep='\t', usecols=['PhraseId', 'Phrase', 'Sentiment'])
# row 2006 has a missing value
train_data = raw_train_data.sample(frac=0.8, random_state=0, axis=0)
val_data = raw_train_data[~raw_train_data.index.isin(train_data.index)]
# row 157451 has a missing value
test_data = pd.read_csv('datasets/test.tsv', sep='\t', usecols=['PhraseId', 'Phrase'])
# Construct the Field objects
x_tokenize = lambda x: x.split()
y_tokenize = lambda y: int(y)

PID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, tokenize=x_tokenize, lower=True, include_lengths=True)
LABEL = data.Field(sequential=False, use_vocab=False)

# get_dataset builds and returns the examples and fields needed to construct a Dataset
def get_dataset(sample_data, id_field, text_field, label_field, test=False):
    fields = [('PhraseId', id_field), ('Phrase', text_field), ('Sentiment', label_field)]
    examples = []

    if test:
        for pid, text in tqdm(zip(sample_data['PhraseId'], sample_data['Phrase'])):
            examples.append(data.Example.fromlist([pid, text, None], fields))
    else:
        for pid, text, label in tqdm(zip(sample_data['PhraseId'], sample_data['Phrase'], sample_data['Sentiment'])):
            examples.append(data.Example.fromlist([pid, text, label], fields))
    return examples, fields

# Get the examples and fields needed to build each Dataset
train_examples, train_fields = get_dataset(train_data, PID, TEXT, LABEL)
val_examples, val_fields = get_dataset(val_data, PID, TEXT, LABEL)
test_examples, test_fields = get_dataset(test_data, PID, TEXT, None, test=True)

# Build the Dataset objects
train = data.Dataset(train_examples, train_fields)
val = data.Dataset(val_examples, val_fields)
test = data.Dataset(test_examples, test_fields)
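
A quick peek at one constructed Example (my own check, not part of the original walkthrough) confirms the fields line up:

# Sanity check: each Example carries a PhraseId, a token list, and (for train/val) a label
print(len(train), len(val), len(test))
print(vars(train.examples[0]))  # e.g. {'PhraseId': ..., 'Phrase': [...], 'Sentiment': ...}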

# Build the vocabulary
# TEXT.build_vocab(train, vectors="glove.6B.50d")
# Specify the cache directory via the cache parameter
cache = '.vector_cache'
vectors = Vectors(name='.vector_cache/glove.6B.50d.txt', cache=cache)
TEXT.build_vocab(train, vectors=vectors)
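
build_vocab copies the pretrained GloVe vectors into TEXT.vocab.vectors. A short check (my addition) shows the lookup tables and the embedding matrix shape:

print(len(TEXT.vocab))           # vocabulary size, including <unk> and <pad>
print(TEXT.vocab.vectors.shape)  # (vocab_size, 50) for glove.6B.50d
print(TEXT.vocab.stoi['movie'])  # string-to-index lookup (assuming the token occurs in train)
print(TEXT.vocab.itos[:10])      # first few index-to-string entries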

# Build the iterators
# To build an iterator for the training set only:
# train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)
# To build iterators for the training and validation sets together:
train_iter, val_iter = data.BucketIterator.splits(
                    (train, val),
                    batch_sizes = (64, 64),
                    device = -1, # if using a GPU, replace -1 with the GPU index
                    sort_key = lambda x: len(x.Phrase),
                    sort_within_batch=True,
                    repeat=False 
                )
test_iter = data.Iterator(test, batch_size=8, device=-1, sort_key = lambda x: len(x.Phrase), sort_within_batch=True, repeat=False)
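
Because TEXT was built with include_lengths=True, each batch.Phrase is a (text, lengths) pair, which is exactly what the model's forward method expects. A quick inspection (my addition):

batch = next(iter(train_iter))
text, lengths = batch.Phrase
print(text.shape)             # (max_seq_len_in_batch, 64): sequence length comes first
print(lengths[:5])            # true phrase lengths, sorted descending within the batch
print(batch.Sentiment.shape)  # (64,)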

# Build the model
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), embedding_dim)
        # nn.init.xavier_uniform_(self.embedding.weight) # initialize weights
        self.embedding.weight.data.copy_(TEXT.vocab.vectors) # load pretrained word vectors

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=False)
        # self.fc = nn.Linear(hidden_dim * num_layers * 2, 5) # bidirectional / multi-layer case
        self.fc = nn.Linear(hidden_dim * num_layers * 1, 5) # unidirectional case
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedding = self.relu(self.embedding(x[0]))
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedding, x[1])
        output, (hidden, cell) = self.lstm(packed_embedding)
        hidden = hidden.view(hidden.size()[1], -1)
        # hidden = torch.cat([hidden[-4], hidden[-3], hidden[-2], hidden[-1]], dim=1) # bidirectional / multi-layer case
        out = self.fc(self.relu(hidden))
        return out

lstm = LSTM(len(TEXT.vocab), 50, 64, 1)
# Optimizer
optimizer = optim.Adam(lstm.parameters(), lr=1e-3)
# Cross-entropy loss
loss_fn = nn.CrossEntropyLoss()
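
A one-batch dry run through the untrained model (again my own check) confirms it returns one logit per sentiment class, which is what CrossEntropyLoss expects:

batch = next(iter(train_iter))
out = lstm(batch.Phrase)
print(out.shape)  # (batch_size, 5): raw logits over the five sentiment labels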

epochs = 10

def binary_acc(preds, y):
    """
    get accuracy
    """
    preds = torch.argmax(preds, dim=1)
    correct = torch.eq(preds, y).float()
    acc = correct.sum()
    return acc
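
Despite its name, binary_acc returns the number of correct predictions in a batch rather than a ratio; the training loop below divides the running sum by the dataset size. A tiny example:

logits = torch.tensor([[0.1, 2.0, 0.3, 0.0, 0.0],   # argmax = 1
                       [1.5, 0.2, 0.0, 0.0, 0.0]])  # argmax = 0
labels = torch.tensor([1, 0])
print(binary_acc(logits, labels))  # tensor(2.): both predictions are correct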

for epoch in range(epochs):
    train_acc = 0
    train_loss = 0
    for i, batch in enumerate(train_iter):
        preds = lstm(batch.Phrase)
        loss = loss_fn(preds, batch.Sentiment)
        
        train_acc += binary_acc(preds, batch.Sentiment).item()
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation set
    val_acc = 0
    val_loss = 0
    for batch in val_iter:
        preds = lstm(batch.Phrase)
        loss = loss_fn(preds, batch.Sentiment)
        
        val_acc += binary_acc(preds, batch.Sentiment).item()
        val_loss += loss.item()
    print('Epoch: {}/{} - \tLoss: {:.6f} - \tAcc: {:.6f} - \tVal_Loss: {:.6f} - \tVal_Acc: {:.6f}'.format(epoch + 1, epochs, 
        train_loss / len(train_iter), train_acc / len(train_iter.dataset),
        val_loss / len(val_iter), val_acc / len(val_iter.dataset)))
    # Save a checkpoint
    torch.save(lstm, 'models/epoch{}_checkpoint.pkl'.format(epoch))


print('train completed')
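
torch.save(lstm, ...) pickles the whole module, which ties the checkpoint to this exact class definition. A common alternative (not what the original code does; the file names below are illustrative) is to save only the parameters and rebuild the model before loading them:

# Inside the training loop, save parameters only (hypothetical file name)
torch.save(lstm.state_dict(), 'models/epoch{}_state.pt'.format(epoch))

# Later, rebuild the model and restore the weights
model = LSTM(len(TEXT.vocab), 50, 64, 1)
model.load_state_dict(torch.load('models/epoch8_state.pt'))
model.eval()  # good practice before inference, although this model has no dropout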

def saveAndPred():
    # Load the saved model
    lstm = torch.load('models/epoch8_checkpoint.pkl')

    # Create the csv file with the header row
    dataframe = pd.DataFrame(columns=['PhraseId', 'Sentiment'])
    dataframe.to_csv('datasets/sampleSubmission.csv', index=False)
    # Predict batch by batch and append to the csv
    for batch in test_iter:
        preds = lstm(batch.Phrase)
        preds = torch.argmax(preds, dim=1)
        res = list(zip(batch.PhraseId.numpy(), preds.numpy()))

        df = pd.DataFrame(res)
        df.to_csv('datasets/sampleSubmission.csv', mode='a+', header=False, index=False)

    # Re-sort the rows by PhraseId
    df = pd.read_csv('datasets/sampleSubmission.csv')
    df.sort_values(['PhraseId'], ascending=True, inplace=True)
    df.to_csv('datasets/sampleSubmission1.csv', index=False)

# saveAndPred()
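
After running saveAndPred(), a quick check (my addition) confirms the submission has one row per test phrase and the expected columns:

sub = pd.read_csv('datasets/sampleSubmission1.csv')
print(sub.shape)             # should equal the number of phrases in test.tsv
print(sub.columns.tolist())  # ['PhraseId', 'Sentiment']
print(sub.head())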

My most recent submission

Public Leaderboard