1 RNN的缺点

上文我们已经介绍了经典的RNN。这篇文章主要聚焦在RNN的改良版：LSTM。关于RNN，教科书上说它容易造成梯度消失或梯度爆炸，但是为什么呢？如果有一个数学推导来演示一下，我想各位一定会很容易明白。

https://blog.csdn.net/mary19831/article/details/129570030

使用这个公式就可以清楚地知道，为什么RNN容易出现梯度消失和梯度爆炸：

image.png

2 LSTM

那么什么是LSTM呢？
可以直接看图，一目了然：

LSTM

与RNN相比，LSTM多了很多门控机制，其中细胞状态（cell state）的更新我认为比较重要：

$$C_t = i_t \odot \tilde{C}_t + f_t \odot C_{t-1}$$
其中 $\tilde{C}_t$ 是候选细胞状态（candidate），$i_t$ 是输入门，$f_t$ 是遗忘门，$\odot$ 表示逐元素相乘。
由这个公式来决定旧记忆和新记忆的保留比例。

但是至于为什么LSTM能够缓解梯度消失和梯度爆炸，由于博主数学基础不好，暂且引用上面文章中给出的解释：

image.png

我也自己推导了一下，写得没有原博主好看：

image.png

3 LSTM demo

用LSTM来解决跟RNN相同的问题，看看LSTM跟RNN相比怎么样

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt


# Sample sin(x) on [0, 30*pi] (15 full periods) at 2000 evenly spaced points.
# np.pi replaces the 3.14 approximation so the range ends exactly on a
# period boundary instead of slightly before it.
x = np.linspace(0, 30 * np.pi, 2000)
y = np.sin(x)

# Plot the raw series as a visual sanity check.
plt.figure(figsize=(10, 4))
plt.plot(x, y)
plt.title('Sine Wave')
plt.xlabel('X')
plt.ylabel('sin(X)')
plt.grid()
plt.show()

def create_sequences(data, seq_length):
    """Slice a 1-D series into sliding windows and their next-step targets.

    For every start index ``i`` with ``i + seq_length < len(data)``, the
    window ``data[i:i + seq_length]`` is an input sample and
    ``data[i + seq_length]`` (the value right after the window) is its label.

    Args:
        data: Indexable, sliceable sequence of values (e.g. a NumPy array).
        seq_length: Number of consecutive values in each input window.

    Returns:
        A tuple ``(inputs, targets)`` of NumPy arrays built from the windows
        and from the values that follow them, in start-index order.
    """
    window_count = len(data) - seq_length
    # range() of a non-positive count is empty, so short inputs yield empty arrays.
    windows = [data[start:start + seq_length] for start in range(window_count)]
    targets = [data[start + seq_length] for start in range(window_count)]
    return np.array(windows), np.array(targets)


seq_length = 20
# Pass the variable instead of repeating the literal so the window length is
# defined in exactly one place. NOTE: `y` is rebound here; from this point on
# it holds the next-step targets, not the raw sine series.
X, y = create_sequences(y, seq_length=seq_length)

# Notebook-style shape check (no effect when run as a plain script).
# Assuming the 2000-sample series and seq_length=20: ((1980, 20), (1980,)).
X.shape, y.shape

# Add a trailing feature axis so X is (num_samples, seq_length, input_size=1),
# i.e. [1980, 20, 1] for the sizes above; y stays 1-D with one target per window.
X = torch.tensor(X, dtype=torch.float32).unsqueeze(2)
y = torch.tensor(y, dtype=torch.float32)

# Chronological 80/20 split without shuffling: the test set is the tail of
# the series.
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

X_train.shape, y_train.shape, X_test.shape, y_test.shape
# With 1980 samples:
# (torch.Size([1584, 20, 1]), torch.Size([1584]), torch.Size([396, 20, 1]), torch.Size([396]))




class SimpleLSTM(nn.Module):
    """Single-layer LSTM followed by a linear head on the last time step."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the LSTM layer and the linear output layer.

        Args:
            input_size: Number of features per time step.
            hidden_size: Size of the LSTM hidden state.
            output_size: Number of values produced by the linear head.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of sequences to one prediction per sequence.

        Args:
            x: Input of shape (batch_size, seq_len, input_size), since the
                LSTM is created with ``batch_first=True``.

        Returns:
            Tensor of shape (batch_size, output_size).
        """
        # nn.LSTM returns (output, (h_n, c_n)):
        #   output: hidden state at every time step,
        #           (batch_size, seq_len, hidden_size)
        #   h_n:    hidden state after the last time step,
        #           (num_layers, batch_size, hidden_size)
        #   c_n:    cell state after the last time step, same shape as h_n;
        #           this is specific to LSTM (a plain RNN has no cell state).
        # Only `output` is needed here, so the state tuple is discarded.
        all_steps, _final_states = self.lstm(x)

        # Keep only the last time step and project it to the output size.
        last_step = all_steps[:, -1, :]
        return self.fc(last_step)



# Model hyper-parameters: one input feature per time step, one predicted value.
input_size = 1
hidden_size = 64
output_size = 1
model = SimpleLSTM(input_size, hidden_size, output_size)

# Regression objective and optimizer.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Import kept next to its first use, as in the original notebook cell.
from torch.utils.data import DataLoader, TensorDataset

# Mini-batches of 32 windows, reshuffled every epoch.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

num_epochs = 200


for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch_X, batch_y in train_loader:
        # The model takes 3-D input (batch_size, seq_len, input_size) and
        # returns (batch_size, output_size): each window of seq_length
        # consecutive sine values predicts the value that follows it.
        predictions = model(batch_X)
        # squeeze(-1) removes only the trailing output axis. A bare squeeze()
        # would also drop the batch axis when a batch holds a single sample,
        # leaving a 0-d prediction that no longer matches the (1,) target.
        loss = criterion(predictions.squeeze(-1), batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Every 20 epochs report the mean per-batch training loss and the loss on
    # the whole held-out test split.
    if (epoch + 1) % 20 == 0:
        model.eval()
        with torch.no_grad():
            test_predictions = model(X_test)
            test_loss = criterion(test_predictions.squeeze(-1), y_test)

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.8f}, Test Loss: {test_loss.item():.8f}')


    
# Final inference pass over both splits, with gradient tracking disabled.
model.eval()
with torch.no_grad():
    train_predictions = model(X_train)
    test_predictions = model(X_test)

# Visualisation: the training split occupies the first len(y_train) time
# steps and the test split continues on the same axis right after it.
train_steps = range(len(y_train))
test_steps = range(len(y_train), len(y_train) + len(y_test))

plt.figure(figsize=(12, 6))
plt.plot(train_steps, y_train.numpy(), label='Train True', color='blue')
plt.plot(train_steps, train_predictions.numpy(), label='Train Predicted', color='orange')
plt.plot(test_steps, y_test.numpy(), label='Test True', color='green')
plt.plot(test_steps, test_predictions.numpy(), label='Test Predicted', color='red')
plt.xlabel('Time Step')
plt.ylabel('Value')
plt.title('RNN Time Series Prediction')
plt.legend()
plt.show()


def predict_future(model, initial_sequence, steps):
    """Forecast ``steps`` values autoregressively from a seed window.

    Each predicted value is appended to the window while the oldest value is
    dropped, so later predictions are based on earlier predictions.

    Args:
        model: Callable module; it is switched to eval mode and called with a
            tensor of shape [1, seq_length, 1]. Its output must hold exactly
            one element (``.item()`` is called on it).
        initial_sequence: Seed window, either 1-D ([seq_length]) or already
            2-D ([seq_length, 1]). The caller's tensor is not modified.
        steps: Number of future values to generate.

    Returns:
        A list of ``steps`` Python floats, in prediction order.
    """
    model.eval()

    # Normalise the seed to [seq_length, 1]; a 2-D seed is cloned first.
    window = (
        initial_sequence.unsqueeze(1)
        if initial_sequence.dim() == 1
        else initial_sequence.clone()
    )

    forecast = []
    with torch.no_grad():
        for _ in range(steps):
            # Add the batch axis -> [1, seq_length, 1] and predict one value.
            step_output = model(window.unsqueeze(0))
            forecast.append(step_output.item())

            # Slide the window: drop the oldest entry and append the new
            # prediction as a [1, 1] row. torch.cat along dim=0 requires all
            # other dimensions to match, which keeps the shape [seq_length, 1].
            window = torch.cat([window[1:], step_output.view(1, 1)], dim=0)

    return forecast


# X is indexed by sample: X[0] is the first input window and X[-1] is the
# last one (X[-2] the second to last, and so on).

# Seed the multi-step forecast with the last window in the dataset.
initial_seq = X[-1].squeeze()  # drop size-1 axes: [seq_length, 1] -> [seq_length]
future_steps = 4000
future_preds = predict_future(model, initial_seq, future_steps)

# Plot the known targets followed by the forecast, which continues on the
# same time axis right after the last known target.
future_axis = range(len(y), len(y) + future_steps)

plt.figure(figsize=(12, 6))
plt.plot(y.numpy(), label='Original Data')
plt.plot(future_axis, future_preds, 'r-', label='Future Predictions')
plt.legend()
plt.title('RNN Multi-step Prediction of sin(x)')
plt.show()

TRUE VS PREDICTION

future predict

比较惊讶的是，LSTM的多步预测并没有逐渐趋于0，只不过最大值和最小值变成了+2和-1.5。但在几百步之内预测还是比较准确的。

4 GRU

GRU与LSTM相比，更轻量化一点，门控机制更简洁一点，只有两个门控：重置门和更新门，调用也比较简单。

# GRU layer built with the same constructor arguments as the nn.LSTM layer.
nn.GRU(input_size, hidden_size, num_layers=1, batch_first=True)

GRU图示：

GRU structure

韩遇安浅论LSTM

韩遇安浅论LSTM

1 RNN的缺点

2 LSTM

3 LSTM demo

4 GRU

推荐阅读更多精彩内容