Sentiment Analysis on Movie Reviews
Classify the sentiment of sentences from the Rotten Tomatoes dataset
Competition link
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews
Data Description
The dataset consists of tab-separated files with phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for the purposes of benchmarking, but the sentences have been shuffled from their original order. Each sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId, and each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.
- train.tsv contains the phrases and their associated sentiment labels. We have additionally provided a SentenceId so that you can track which phrases belong to a single sentence.
- test.tsv contains just phrases. You must assign a sentiment label to each phrase.
The sentiment labels are:
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
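Before writing any code it helps to look at how each sentence is broken into overlapping phrases. A minimal peek at the raw file (a sketch, assuming the TSVs are unpacked under a local datasets/ directory, as in the code later on):
import pandas as pd
# Inspect the columns and the phrases parsed from the first sentence
peek = pd.read_csv('datasets/train.tsv', sep='\t')
print(peek.columns.tolist())                   # ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
print(peek[peek['SentenceId'] == 1].head(10))  # many overlapping phrases from a single sentence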
Writing the Code
# Import torchtext and related packages
from torchtext import data
from torchtext.vocab import Vectors
from tqdm import tqdm
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
# Read the data with pandas
raw_train_data = pd.read_csv('datasets/train.tsv', sep='\t', usecols=['PhraseId', 'Phrase', 'Sentiment'])
# Row 2006 has a missing value
# 80/20 split into training and validation sets
train_data = raw_train_data.sample(frac=0.8, random_state=0, axis=0)
val_data = raw_train_data[~raw_train_data.index.isin(train_data.index)]
# Row 157451 has a missing value
test_data = pd.read_csv('datasets/test.tsv', sep='\t', usecols=['PhraseId', 'Phrase'])
# Construct the Field objects
x_tokenize = lambda x: x.split()
y_tokenize = lambda y: int(y)
PID = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, tokenize=x_tokenize, lower=True, include_lengths=True)
LABEL = data.Field(sequential=False, use_vocab=False)
# get_dataset builds and returns the examples and fields needed to construct a Dataset
def get_dataset(sample_data, id_field, text_field, label_field, test=False):
    fields = [('PhraseId', id_field), ('Phrase', text_field), ('Sentiment', label_field)]
    examples = []
    if test:
        for pid, text in tqdm(zip(sample_data['PhraseId'], sample_data['Phrase'])):
            examples.append(data.Example.fromlist([pid, text, None], fields))
    else:
        for pid, text, label in tqdm(zip(sample_data['PhraseId'], sample_data['Phrase'], sample_data['Sentiment'])):
            examples.append(data.Example.fromlist([pid, text, label], fields))
    return examples, fields
# Get the examples and fields needed to build each Dataset
train_examples, train_fields = get_dataset(train_data, PID, TEXT, LABEL)
val_examples, val_fields = get_dataset(val_data, PID, TEXT, LABEL)
test_examples, test_fields = get_dataset(test_data, PID, TEXT, None, test=True)
# Build the Dataset objects
train = data.Dataset(train_examples, train_fields)
val = data.Dataset(val_examples, val_fields)
test = data.Dataset(test_examples, test_fields)
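A quick check that the three Dataset objects look as expected (a sketch; the exact counts depend on the 80/20 split above):
print(len(train), len(val), len(test))
print(vars(train.examples[0]))  # dict with 'PhraseId', the tokenized 'Phrase', and 'Sentiment'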
# Build the vocabulary
# TEXT.build_vocab(train, vectors="glove.6B.50d")
# Specify the local cache directory for the pretrained vectors via the cache argument
cache = '.vector_cache'
vectors = Vectors(name='.vector_cache/glove.6B.50d.txt', cache=cache)
TEXT.build_vocab(train, vectors=vectors)
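To confirm the vocabulary and the pretrained vectors line up, a small sanity check (a sketch; the exact sizes depend on the training split):
print(len(TEXT.vocab))           # vocabulary size, including the <unk> and <pad> specials
print(TEXT.vocab.vectors.shape)  # (vocab_size, 50) for glove.6B.50d
print(TEXT.vocab.itos[:10])      # the most frequent tokens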
# Build the iterators
# To build an iterator for the training set only:
# train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)
# Build iterators for the training and validation sets at the same time
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_sizes=(64, 64),
    device=-1,  # if using a GPU, replace -1 with the GPU id
    sort_key=lambda x: len(x.Phrase),
    sort_within_batch=True,
    repeat=False
)
test_iter = data.Iterator(test, batch_size=8, device=-1, sort_key = lambda x: len(x.Phrase), sort_within_batch=True, repeat=False)
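Because TEXT was built with include_lengths=True, batch.Phrase in each batch is a (token_ids, lengths) tuple rather than a single tensor; the model below relies on this when it indexes x[0] and x[1]. A quick look at one training batch (a sketch):
batch = next(iter(train_iter))
text, lengths = batch.Phrase   # text: (seq_len, batch_size) LongTensor; lengths: (batch_size,)
print(text.shape, lengths.shape)
print(batch.Sentiment.shape)   # (batch_size,) integer labels in [0, 4]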
# Define the model
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.init.xavier_uniform_(self.embedding.weight)  # initialize weights randomly
        self.embedding.weight.data.copy_(TEXT.vocab.vectors)  # load the pretrained word vectors
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=False)
        # self.fc = nn.Linear(hidden_dim * num_layers * 2, 5)  # bidirectional / multi-layer case
        self.fc = nn.Linear(hidden_dim * num_layers * 1, 5)  # unidirectional case
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x is a (token_ids, lengths) tuple because TEXT was built with include_lengths=True
        embedding = self.relu(self.embedding(x[0]))
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedding, x[1])
        output, (hidden, cell) = self.lstm(packed_embedding)
        hidden = hidden.view(hidden.size()[1], -1)
        # hidden = torch.cat([hidden[-4], hidden[-3], hidden[-2], hidden[-1]], dim=1)  # bidirectional / multi-layer case
        out = self.fc(self.relu(hidden))
        return out
lstm = LSTM(len(TEXT.vocab), 50, 64, 1)
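Printing the model and counting its trainable parameters is a cheap sanity check before training (a sketch; the exact count depends on the vocabulary size built above):
print(lstm)
num_params = sum(p.numel() for p in lstm.parameters() if p.requires_grad)
print('trainable parameters:', num_params)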
# Optimizer
optimizer = optim.Adam(lstm.parameters(), lr=1e-3)
# Cross-entropy loss
loss_fn = nn.CrossEntropyLoss()
epochs = 10
def binary_acc(preds, y):
    """
    Return the number of correct predictions in the batch.
    """
    preds = torch.argmax(preds, dim=1)
    correct = torch.eq(preds, y).float()
    acc = correct.sum()
    return acc
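A tiny check of binary_acc with dummy logits; despite its name it returns the number of correct predictions in a five-class batch, and the per-epoch accuracy is obtained later by dividing these counts by the dataset size:
dummy_preds = torch.tensor([[0.1, 0.9, 0.0, 0.0, 0.0],
                            [0.8, 0.1, 0.0, 0.0, 0.1]])
dummy_y = torch.tensor([1, 2])
print(binary_acc(dummy_preds, dummy_y))  # tensor(1.) -- only the first prediction is correct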
for epoch in range(epochs):
    train_acc = 0
    train_loss = 0
    lstm.train()
    for i, batch in enumerate(train_iter):
        preds = lstm(batch.Phrase)
        loss = loss_fn(preds, batch.Sentiment)
        train_acc += binary_acc(preds, batch.Sentiment).item()
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation set
    val_acc = 0
    val_loss = 0
    lstm.eval()
    with torch.no_grad():
        for batch in val_iter:
            preds = lstm(batch.Phrase)
            loss = loss_fn(preds, batch.Sentiment)
            val_acc += binary_acc(preds, batch.Sentiment).item()
            val_loss += loss.item()
    print('Epoch: {}/{} - \tLoss: {:.6f} - \tAcc: {:.6f} - \tVal_Loss: {:.6f} - \tVal_Acc: {:.6f}'.format(
        epoch, epochs,
        train_loss / len(train_iter.dataset), train_acc / len(train_iter.dataset),
        val_loss / len(val_iter.dataset), val_acc / len(val_iter.dataset)))
    # Save a checkpoint after each epoch
    torch.save(lstm, 'models/epoch{}_checkpoint.pkl'.format(epoch))
print('train completed')
def saveAndPred():
    # Load a saved checkpoint
    lstm = torch.load('models/epoch8_checkpoint.pkl')
    lstm.eval()
    # Create the submission CSV with the header only
    dataframe = pd.DataFrame(columns=['PhraseId', 'Sentiment'])
    dataframe.to_csv('datasets/sampleSubmission.csv', index=False)
    # Predict on the test set and append the results batch by batch
    with torch.no_grad():
        for batch in test_iter:
            preds = lstm(batch.Phrase)
            preds = torch.argmax(preds, dim=1)
            res = list(zip(batch.PhraseId.numpy(), preds.numpy()))
            df = pd.DataFrame(res)
            df.to_csv('datasets/sampleSubmission.csv', mode='a+', header=False, index=False)
    # Re-sort the rows by PhraseId before submitting
    df = pd.read_csv('datasets/sampleSubmission.csv')
    df.sort_values(['PhraseId'], ascending=True, inplace=True)
    df.to_csv('datasets/sampleSubmission1.csv', index=False)
# saveAndPred()
My most recent submission
Public Leaderboard