The check-in came up suddenly, so let's go straight to the code.
# Imports
import time
import math
import sys
# sys.path.append("/home/kesci/input")
import re
import pandas as pd
import numpy as np
import collections
import os
import random
from tqdm import tqdm
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
! pip install jieba -i https://pypi.douban.com/simple
! pip install -i https://pypi.tuna.tsinghua.edu.cn/simple gensim
import jieba
import gensim
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
# Read the data
def read_data(path):
    # Returns a 2-D list: axis 0 indexes the rows (one per comment), axis 1 the columns (0 = recommend label, 1 = comment text)
    with open(path, 'r') as f:
        lines = f.read()
    comments = [line.split('\t') for line in lines.split('\n')]  # split() on the raw string returns a list
    return comments[:-1]  # drop the last element, the empty string left by the trailing newline
cmts = read_data('/home/kesci/input/Comments9120/train_shuffle.txt')
# Example: cmts[300][1] == '牛油果卷很清爽'
# len(cmts)=16000
for i in range(len(cmts)):
    cmts[i].reverse()
# Now shaped like [['酸菜鱼不错', '0'],
#                  ['轻食素食都是友善的饮食方式', '0'], ...
# Adjust the format of cmts_handout so it matches cmts and the same functions can be shared
cmts_handout = read_data('/home/kesci/input/Comments9120/test_handout.txt')
for cmt_handout in cmts_handout:
    cmt_handout.append('0')  # append a dummy '0' label so every row has the [text, label] format
cmts_merge = cmts+cmts_handout
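# Quick sanity check (optional sketch): after the dummy labels are added, every row in both lists has the
# [text, label] shape, and the merged list is just their concatenation.
print(len(cmts), len(cmts_handout), len(cmts_merge))  # expect len(cmts) + len(cmts_handout) == len(cmts_merge)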
# Method 2: build the word index (vocabulary)
def get_tokenized_9120(data):
    '''
    @params:
        data: list of samples, each element a [comment text, 0/1 label] pair
    @return: list of tokenized texts, each element the word sequence of one comment
    '''
    def tokenizer(text):
        return jieba.lcut(text)  # for English you would split instead: [tok.lower() for tok in text.split(' ')]
    return [tokenizer(review) for review, _ in data]

def get_vocab_9120(data):
    '''
    @params:
        data: same as above
    @return: vocabulary built on the dataset, an instance of Vocab (freqs, stoi, itos)
    '''
    tokenized_data = get_tokenized_9120(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=1)  # min_freq was originally 5; try 1 to keep every word
vocab = get_vocab_9120(cmts_merge)
print('# words in vocab:', len(vocab))
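# Optional peek at the vocabulary (a sketch, assuming the legacy torchtext Vocab API with itos/stoi used above;
# in that version indices 0/1 are the special <unk>/<pad> tokens and stoi maps unseen tokens to 0):
print(vocab.itos[:10])    # the special tokens followed by the most frequent words
print(vocab.stoi['不错'])  # index of a sample token from the corpus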
# With the vocabulary and word indices in place, the dataset's texts can be converted from strings to sequences of word indices for later use.
def preprocess_9120(data, vocab):
    '''
    @params:
        data: same as above, the raw data as read in
        vocab: vocabulary built on the training set
    @return:
        features: word index sequences, an integer tensor of shape (n, max_l)
        labels: sentiment labels, a 0/1 integer tensor of shape (n,)
    '''
    max_l = 10  # truncate or zero-pad every comment to length 10; this can be reduced later if needed
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))
    tokenized_data = get_tokenized_9120(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([int(score) for _, score in data])  # the 0/1 labels are strings in the file, cast them to int
    return features, labels
# features_tmp, labels_tmp = preprocess_9120(cmts, vocab)
# print(features_tmp)
# print(labels_tmp)
# # tensor([[ 243, 4, 0, ..., 0, 0, 0],
# # [2668, 1450, 9, ..., 1899, 0, 0],
# # [2302, 1048, 18, ..., 0, 0, 0],
# # ...,
# # [ 295, 15, 4, ..., 0, 0, 0],
# # [ 16, 67, 36, ..., 0, 0, 0],
# # [8946, 193, 7, ..., 0, 0, 0]])
# # tensor([0, 0, 0, ..., 0, 1, 0])
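# To make the truncate-or-pad behaviour concrete, a tiny standalone sketch of the same pad() logic
# (demo_pad / demo_max_l are illustration-only names, the logic matches pad() above):
demo_max_l = 10
def demo_pad(x):
    return x[:demo_max_l] if len(x) > demo_max_l else x + [0] * (demo_max_l - len(x))
print(demo_pad([5, 8, 2]))           # short sequence -> [5, 8, 2, 0, 0, 0, 0, 0, 0, 0]
print(demo_pad(list(range(1, 15))))  # long sequence  -> truncated to the first 10 indices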
# Split into train and test sets
k = 100
test_data = cmts[:len(cmts)//k]   # the first 1/k is the test data
train_data = cmts[len(cmts)//k:]  # the remaining (k-1)/k is the train data
# # Swap the splits instead:
# train_data = cmts[:len(cmts)//k]  # the first 1/k as train data
# test_data = cmts[len(cmts)//k:]   # the remaining (k-1)/k as test data
# Create data iterators
train_set = Data.TensorDataset(*preprocess_9120(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_9120(test_data, vocab))
# The code above is equivalent to the commented code below
# train_features, train_labels = preprocess_9120(train_data, vocab)
# test_features, test_labels = preprocess_9120(test_data, vocab)
# train_set = Data.TensorDataset(train_features, train_labels)
# test_set = Data.TensorDataset(test_features, test_labels)
# len(train_set) = features.shape[0] or labels.shape[0]
# train_set[index] = (features[index], labels[index])
batch_size = 64
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
print('#batches:', len(train_iter))
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: vocabulary built on the dataset, used here to get the vocabulary size
            embed_size: embedding dimension
            num_hiddens: hidden state dimension
            num_layers: number of recurrent layers
        '''
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # encoder-decoder framework
        # setting bidirectional=True gives a bidirectional recurrent network
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        self.decoder = nn.Linear(4*num_hiddens, 2)  # the hidden states of the first and last time steps are used as the input to the fully connected layer

    def forward(self, inputs):
        '''
        @params:
            inputs: word index sequences, an integer tensor of shape (batch_size, seq_len)
        @return:
            outs: sentiment predictions, a tensor of shape (batch_size, 2)
        '''
        # nn.LSTM expects the sequence length (seq_len) as the first dimension, so transpose the input
        embeddings = self.embedding(inputs.permute(1, 0))  # (seq_len, batch_size, d)
        # nn.LSTM returns the outputs plus the hidden and cell states, i.e. outputs, (h, c)
        outputs, _ = self.encoder(embeddings)  # (seq_len, batch_size, 2*h)
        encoding = torch.cat((outputs[0], outputs[-1]), -1)  # (batch_size, 4*h)
        outs = self.decoder(encoding)  # (batch_size, 2)
        return outs
embed_size, num_hiddens, num_layers = 300, 150, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
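# Optional shape check (a sketch): push one dummy batch of word indices through the untrained net to confirm
# the (batch_size, 2) output shape described in forward(); the batch size 4 here is arbitrary.
with torch.no_grad():
    dummy_batch = torch.randint(0, len(vocab), (4, 10))  # 4 fake reviews, each of length max_l = 10
    print(net(dummy_batch).shape)  # expected: torch.Size([4, 2])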
# Load pretrained word vectors
# # The pretrained file is too large, so it was loaded locally
# model = KeyedVectors.load_word2vec_format(datapath(r'E:\Computer Science\9.deep learning\Project\NLP\pretrained_corpus\sgns.weibo.bigram-char'), binary=False)
# def load_pretrained_embedding(words, pretrained_vocab):
#     '''
#     @params:
#         words: the words whose vectors need loading, given as the itos (index to string) list of the vocabulary
#         pretrained_vocab: the pretrained word vectors
#     @return:
#         embed: the loaded word vectors
#     '''
#     embed = torch.zeros(len(words), pretrained_vocab.vector_size)  # initialize to zeros
#     oov_count = 0  # out of vocabulary
#     for i, word in enumerate(words):
#         try:
#             embed[i, :] = torch.from_numpy(pretrained_vocab[word])
#         except KeyError:
#             oov_count += 1
#     if oov_count > 0:
#         print("There are %d oov words." % oov_count)
#     return embed
# net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, model))
# net.embedding.weight.requires_grad = False  # the pretrained vectors are loaded directly, so they do not need updating
# # Save the variable with pickle
# import pickle
# f = open('data.pkl','wb')
# pickle.dump(net.embedding.weight.data,f)
# f.close()
# Load the variable with pickle
import pickle
with open('/home/kesci/work/merge_train_test.pkl', 'rb') as f:
    pretrained_weight = pickle.load(f)  # renamed from read_data so it no longer shadows the read_data() function above
net.embedding.weight.data.copy_(pretrained_weight)
net.embedding.weight.requires_grad = False  # the pretrained vectors are loaded directly, so they do not need updating
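# Sanity check (a sketch): with the embedding frozen, only the LSTM encoder and the Linear decoder should remain
# trainable, which is what the filter(...) passed to the optimizer below relies on.
print([name for name, p in net.named_parameters() if p.requires_grad])  # 'embedding.weight' should be absent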
# Train the model
def evaluate_accuracy(data_iter, net, device=device):
    if device is None and isinstance(net, torch.nn.Module):
        device = list(net.parameters())[0].device
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval()
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train()
            else:
                if ('is_training' in net.__code__.co_varnames):
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item()
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))
lr, num_epochs = 0.01, 15
optimizer = torch.optim.Adam(
filter(
lambda p: p.requires_grad,
net.parameters()),
lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)
# Evaluate the model
def predict_sentiment(net, vocab, sentences):
    '''
    @params:
        net: the trained model
        vocab: vocabulary built on this dataset, used to turn the given word sequences into index sequences for the model
        sentences: the texts whose sentiment is to be analysed, each given as a word sequence
    @return: the predicted results, the positive-class probability for each sentence
    '''
    predict_result = []
    for sentence in sentences:
        # print(sentence)
        device = list(net.parameters())[0].device  # read which device the model lives on
        sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
        t = net(sentence.view((1, -1)))
        m = nn.Softmax(dim=1)
        hat = m(t)[0][1]
        hat = hat.detach().cpu().numpy()
        hat = hat.astype(np.float16)  # astype returns a new array, so assign it back
        predict_result = np.append(predict_result, hat)
        # print(hat)
        # print('\n')
        # label = torch.argmax(net(sentence.view((1, -1))), dim=1)
        # print(label)
        # print(net(sentence.view((1, -1))))
        # return 'positive' if label.item() == 1 else 'negative'
    return predict_result
# predict_sentiment(net, vocab, [['饭', '哈哈香'],['饭', '香']])
# vocab_handout = get_vocab_9120(cmts_handout)
tokenized_data_handout = get_tokenized_9120(cmts_handout)
result = pd.DataFrame(predict_sentiment(net, vocab, tokenized_data_handout))
result.to_csv("./output_v299.csv")
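# The CSV above is written with pandas defaults (an unnamed probability column plus an integer index).
# A hedged sketch in case the submission format needs named columns (the names 'id'/'prediction' are assumptions):
# result.columns = ['prediction']
# result.to_csv('./output_v299.csv', index_label='id')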