最近在一些文章中看到用LSTM预测时间序列,通过使用网络上的数据并借鉴网上的代码,对LSTM模型的搭建有了一定的了解。本篇文章使用的数据来自《用Python预测「周期性时间序列」的正确姿势》一文。
在数据中,部分时间点的数据值是缺失的,上方链接中的作者通过取平均值的方法填补了缺失数据,折线图如下:在折线图中,能看到开头的数据和其它数据差异较大。由于数据是周期性的,于是对开头“不合群”的数据做处理,处理方式是取其后6天中相同时间点的数据的平均值,代码如下:
def api_dataset(path='api_access_fix.csv', period=1440, threshold=500, n_periods=6):
    """Load the access-count series from a CSV file and repair the first period.

    The third column (index 2) of every row is parsed as a number and
    truncated to ``int``. Rows whose third column is missing or is not a
    finite number (for example a header row) are skipped.

    Within the first ``period`` samples, every value ``<= threshold`` is
    replaced by the truncated mean of the samples found at the same offset in
    the following ``n_periods`` periods. Periods that lie beyond the end of
    the data are left out of the mean; if none is available the value is
    kept unchanged.

    Args:
        path: CSV file to read (opened with ``utf-8-sig`` encoding).
        period: Number of samples in one period.
        threshold: Values at or below this are treated as outliers.
        n_periods: How many following periods are averaged.

    Returns:
        A NumPy array built from one single-element row per parsed sample.
    """
    dataset = []
    with open(path, encoding='utf-8-sig') as f:
        for item in csv.reader(f):
            try:
                dataset.append([int(float(item[2]))])
            except (IndexError, ValueError, OverflowError):
                # Short rows, headers, empty cells, NaN and inf are skipped.
                continue

    total = len(dataset)
    for i in range(min(period, total)):
        if dataset[i][0] > threshold:
            continue
        # Same offset in each of the next n_periods periods, clipped to the data.
        stop = min(total, i + period * n_periods + 1)
        later = [dataset[j][0] for j in range(i + period, stop, period)]
        if later:
            dataset[i][0] = int(sum(later) / len(later))
    return np.array(dataset)
当然,本文所使用的处理方式还是比较草率的,未经过深入的调研与分析。处理后的折线图如下:接下来是数据归一化函数,便于模型学习:
# Normalisation helper
def sc_fit_transform(nDlist):
    """Fit a MinMaxScaler with feature range (0, 1) on ``nDlist`` and scale it.

    Returns a tuple ``(scaler, scaled)``: the fitted scaler and the scaled
    data wrapped in a NumPy array.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = np.array(scaler.fit_transform(X=nDlist))
    return scaler, scaled
准备训练数据,将前6天的数据作为训练集,即 training_num = 8640(6 × 1440):
# Window length: 60 past samples are fed to the model for each prediction.
timestep = 60
# Number of samples in the training split.
training_num = 8640
# Number of training epochs.
epoch = 10
# Mini-batch size.
batch_size = 100
###############################################################################
listDataset = api_dataset()
# print(listDataset.shape)
# Inputs are the first training_num samples; targets are the same series
# shifted forward by one sample. Each is normalised with its own scaler.
scTrainDataseX, xTrainDataset = sc_fit_transform(listDataset[:training_num])
scTrainDataseY, yTrainDataset = sc_fit_transform(listDataset[1:training_num + 1])
将数据处理为LSTM输入的形式:
# Build the training tensors expected by the LSTM: one window of `timestep`
# consecutive normalised samples per training example.
xTrain = np.array(
    [xTrainDataset[i - timestep:i] for i in range(timestep, training_num)]
)
# print(xTrain.shape)
# NOTE(review): the window above ends at xTrainDataset[i - 1], while
# yTrainDataset[i] is the (normalised) sample two positions after that,
# because yTrainDataset is already shifted by one. The test windows are built
# the same way, so change both together if a one-step horizon is intended.
yTrain = np.array([yTrainDataset[i] for i in range(timestep, training_num)])
# print(yTrain.shape)
构建网络:
# Build the network as a Sequential model: one LSTM layer followed by a
# single-unit Dense output.
model = Sequential()
model.add(LSTM(units=128, input_shape=[xTrain.shape[1], 1]))
model.add(Dense(1))
# Configure training. This is a regression task, so report mean absolute
# error; 'accuracy' is a classification metric and carries no useful
# information for continuous targets.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mae'])
model.fit(x=xTrain, y=yTrain, epochs=epoch, batch_size=batch_size)
# Save the trained model.
model.save('my_model.h5')
准备测试数据:
# Test-set preparation mirrors the training-set preparation.
# 10080 = 7 * 1440, so the test split is presumably the seventh day.
xTestDataset = listDataset[training_num:10080-2]
yTestDataset = listDataset[training_num+1:10080-1]
# Reuse the scalers fitted on the training split instead of fitting new ones
# on the test data: the model only knows the training scale, and fitting on
# test data would leak test statistics into the evaluation. The original
# variable names are kept as aliases for the code that follows.
scTesDatasetX = scTrainDataseX
scTestDataseY = scTrainDataseY
xTestDataset = scTesDatasetX.transform(xTestDataset)
yTestDataset = scTestDataseY.transform(yTestDataset)
# Build the test-set windows expected by the LSTM.
xTest = np.array(
    [xTestDataset[i - timestep:i] for i in range(timestep, len(xTestDataset))]
)
# print(xTest.shape)
# NOTE(review): as in the training windows, yTestDataset[i] lies two samples
# after the last element of window i; keep both in sync if this is changed.
yTest = np.array([yTestDataset[i] for i in range(timestep, len(xTestDataset))])
# Undo the normalisation so the targets are back in original units.
yTest = scTestDataseY.inverse_transform(X=yTest)
# print(yTest.shape)
# print(yTest)
进行预测:
# Run the model on the test windows.
yPredictes = model.predict(x=xTest)
# Undo the normalisation. The model was trained on targets scaled by
# scTrainDataseY, so its outputs must be inverted with that same scaler
# rather than with one fitted on the test labels.
yPredictes = scTrainDataseY.inverse_transform(X=yPredictes)
# print(yPredictes.shape)
# print(yPredictes)
结果评价:
# Evaluation metrics: MAE, RMSE and R^2.
mae = mean_absolute_error(yTest, yPredictes)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and is not accepted
# by later releases, whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(yTest, yPredictes))
r2 = r2_score(yTest, yPredictes)
# print(mae, rmse, r2)
# 45.70792188492153 74.77525176850149 0.9880226807229917
完整代码如下:
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def api_dataset(path='api_access_fix.csv', period=1440, threshold=500, n_periods=6):
    """Load the access-count series from a CSV file and repair the first period.

    The third column (index 2) of every row is parsed as a number and
    truncated to ``int``. Rows whose third column is missing or is not a
    finite number (for example a header row) are skipped.

    Within the first ``period`` samples, every value ``<= threshold`` is
    replaced by the truncated mean of the samples found at the same offset in
    the following ``n_periods`` periods. Periods that lie beyond the end of
    the data are left out of the mean; if none is available the value is
    kept unchanged.

    Args:
        path: CSV file to read (opened with ``utf-8-sig`` encoding).
        period: Number of samples in one period.
        threshold: Values at or below this are treated as outliers.
        n_periods: How many following periods are averaged.

    Returns:
        A NumPy array built from one single-element row per parsed sample.
    """
    dataset = []
    with open(path, encoding='utf-8-sig') as f:
        for item in csv.reader(f):
            try:
                dataset.append([int(float(item[2]))])
            except (IndexError, ValueError, OverflowError):
                # Short rows, headers, empty cells, NaN and inf are skipped.
                continue

    total = len(dataset)
    for i in range(min(period, total)):
        if dataset[i][0] > threshold:
            continue
        # Same offset in each of the next n_periods periods, clipped to the data.
        stop = min(total, i + period * n_periods + 1)
        later = [dataset[j][0] for j in range(i + period, stop, period)]
        if later:
            dataset[i][0] = int(sum(later) / len(later))
    return np.array(dataset)
# Normalisation helper
def sc_fit_transform(nDlist):
    """Fit a MinMaxScaler with feature range (0, 1) on ``nDlist`` and scale it.

    Returns a tuple ``(scaler, scaled)``: the fitted scaler and the scaled
    data wrapped in a NumPy array.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = np.array(scaler.fit_transform(X=nDlist))
    return scaler, scaled
###############################################################################
# Window length: the previous 60 samples are used to predict the next one.
timestep = 60
# Number of samples in the training split.
training_num = 8640
# Number of training epochs.
epoch = 10
# Mini-batch size.
batch_size = 100
###############################################################################
listDataset = api_dataset()
# print(listDataset.shape)
# Training inputs: the first training_num samples.
xTrainDataset = listDataset[0:training_num]
# Training targets: the same series shifted by one, so yTrainDataset[j] is the
# sample that follows xTrainDataset[j].
yTrainDataset = listDataset[1:training_num+1]
# Normalise inputs and targets; the fitted scalers are reused for the test
# split and for mapping predictions back to original units.
scTrainDataseX, xTrainDataset = sc_fit_transform(xTrainDataset)
scTrainDataseY, yTrainDataset = sc_fit_transform(yTrainDataset)
###############################################################################
# Build the training tensors expected by the LSTM.
xTrain = np.array(
    [xTrainDataset[i - timestep:i] for i in range(timestep, training_num)]
)
# print(xTrain.shape)
# Window i ends at xTrainDataset[i - 1]; its successor in the shifted target
# series is yTrainDataset[i - 1]. (Indexing with i would skip one sample and
# train a two-step-ahead predictor.)
yTrain = np.array(
    [yTrainDataset[i - 1] for i in range(timestep, training_num)]
)
# print(yTrain.shape)
###############################################################################
# Build the network as a Sequential model.
model = Sequential()
# return_sequences=True would return the full output sequence; as the first
# layer, the LSTM needs an explicit input shape.
model.add(LSTM(units=128, input_shape=[xTrain.shape[1], 1]))
model.add(Dense(1))
# Configure training. This is a regression task, so report mean absolute
# error; 'accuracy' is a classification metric and is meaningless here.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mae'])
model.fit(x=xTrain, y=yTrain, epochs=epoch, batch_size=batch_size)
model.save('my_model.h5')
###############################################################################
# Test split. 10080 = 7 * 1440, so this is presumably the seventh day.
xTestDataset = listDataset[training_num:10080-2]
yTestDataset = listDataset[training_num+1:10080-1]
# Reuse the scalers fitted on the training split: the model only knows the
# training scale, and fitting new scalers on test data would leak test
# statistics into the evaluation.
scTesDatasetX = scTrainDataseX
scTestDataseY = scTrainDataseY
xTestDataset = scTesDatasetX.transform(xTestDataset)
yTestDataset = scTestDataseY.transform(yTestDataset)
# Build the test tensors expected by the LSTM.
xTest = np.array(
    [xTestDataset[i - timestep:i] for i in range(timestep, len(xTestDataset))]
)
print(xTest.shape)
# Same alignment as for training: the sample right after window i.
yTest = np.array(
    [yTestDataset[i - 1] for i in range(timestep, len(xTestDataset))]
)
# Undo the normalisation so the targets are back in original units.
yTest = scTestDataseY.inverse_transform(X=yTest)
print(yTest.shape)
print(yTest)
###############################################################################
# Run the model on the test windows.
yPredictes = model.predict(x=xTest)
# Undo the normalisation with the scaler the training targets were scaled by.
yPredictes = scTrainDataseY.inverse_transform(X=yPredictes)
print(yPredictes.shape)
print(yPredictes)
###############################################################################
# Plot real (red) against predicted (blue) values.
plt.plot(yTest, color='red', label='Real')
plt.plot(yPredictes, color='blue', label='Predict')
plt.title(label='Prediction')
plt.xlabel(xlabel='Time')
plt.ylabel(ylabel='Api_access_num')
plt.legend()
plt.show()
# Evaluation metrics: MAE, RMSE and R^2.
mae = mean_absolute_error(yTest, yPredictes)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated since scikit-learn 1.4 and is not accepted
# by later releases, whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(yTest, yPredictes))
r2 = r2_score(yTest, yPredictes)
print(mae, rmse, r2)
# Output recorded from earlier runs of the original version of this script
# (mae, rmse, r2), kept for reference:
# 72.02636248234026 98.38626354602893 0.9791679689516253
# 45.70792188492153 74.77525176850149 0.9880226807229917
测试集和预测数据折线图:
mae:45.70792188492153
rmse:74.77525176850149
r2:0.9880226807229917