在日常的机器学习工作中,我们拿到的原始数据往往不能直接使用,必须先对它进行预处理。以股票数据为例,当前的股价并不是独立的,它与之前的股价有关,一个可用的方法是把股价转换为相对上一时刻的增长幅度。
time1, price1
time2, price2
转为
time1, NaN
time2, (price2-price1)/price1
scikit-learn(sklearn)提供了Pipeline工具,方便数据处理。
# Data-processing step ColumnExtractor: pulls one named column out of the input.
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that returns ``X[column_name]``.

    Parameters
    ----------
    column_name
        Key used to index the input in ``transform`` (for example a
        DataFrame column label such as ``"Close"``).
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this step learns no state."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column_name]``; ``y`` is ignored."""
        return X[self.column_name]
# Data-processing step TimeSeriesDiff: computes data.diff(k) / data.shift(k).
class TimeSeriesDiff(BaseEstimator, TransformerMixin):
    """Pipeline step that converts values into relative change over ``k`` rows.

    Each output entry is ``(x[t] - x[t-k]) / x[t-k]``; the first ``k`` rows
    have no predecessor and therefore come out as NaN.

    Parameters
    ----------
    k : int, default 1
        Lag, in rows, passed to both ``diff`` and ``shift``.
    """

    def __init__(self, k=1):
        self.k = k

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this step learns no state."""
        return self

    def transform(self, X, y=None):
        """Return the relative change of ``X`` with lag ``self.k``.

        Raises
        ------
        TypeError
            If ``X`` is neither a pandas DataFrame nor a pandas Series.
        """
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of DataFrame / Series.
        if not isinstance(X, (pd.DataFrame, pd.Series)):
            raise TypeError("Have to be a pandas data frame or Series object!")
        return X.diff(self.k) / X.shift(self.k)
# Data-processing step TimeSeriesEmbedder: every k consecutive values form one
# new sample, giving N - k samples in total.
class TimeSeriesEmbedder(BaseEstimator, TransformerMixin):
    """Pipeline step that turns a sequence into overlapping windows of length ``k``.

    The work is delegated to the module-level ``embed_time_series`` function.

    Parameters
    ----------
    k : int
        Window length handed to ``embed_time_series``.
    """

    def __init__(self, k):
        self.k = k

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this step learns no state."""
        return self

    def transform(self, X, y=None):
        """Return ``embed_time_series(X, self.k)``; ``y`` is ignored."""
        return embed_time_series(X, self.k)
def embed_time_series(x, k):
    """Build overlapping windows of ``k`` consecutive values from ``x``.

    Row ``i`` of the result is ``x[i:i + k]`` for ``i`` in ``0 .. n - k - 1``,
    where ``n = len(x)``. That is ``n - k`` rows: the last possible window,
    starting at ``n - k``, is not emitted.
    NOTE(review): presumably this leaves one following value per window to
    serve as the prediction target - confirm against how y is built.

    Parameters
    ----------
    x
        Sliceable sequence (list, NumPy array, pandas Series, ...).
    k : int
        Window length; must be strictly smaller than ``len(x)``.

    Returns
    -------
    numpy.ndarray
        Array with ``n - k`` rows of ``k`` values each.

    Raises
    ------
    ValueError
        If ``k`` is greater than or equal to ``len(x)``.
    """
    n = len(x)
    if k >= n:
        raise ValueError(
            "Can not deal with k greater than or equal to the length of x"
        )
    return np.array([list(x[start:start + k]) for start in range(n - k)])
使用Pipeline对数据进行预处理,并与回归模型组合成新的模型
# ``Imputer`` was removed from sklearn.preprocessing in scikit-learn 0.22;
# ``SimpleImputer`` (available since 0.20) is its replacement. Fall back to
# the old class so the snippet still runs on older installations.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer


def _column_feature_pipeline(column_name, window=10):
    """Build the per-column pipeline: extract, relative diff, window, impute."""
    return Pipeline([
        ("ColumnEx", ColumnExtractor(column_name)),
        ("Diff", TimeSeriesDiff()),
        ("Embed", TimeSeriesEmbedder(window)),
        # The diff step leaves NaN in the leading rows; fill them in.
        ("ImputerNA", SimpleImputer()),
    ])


# Features derived from the "Close" column.
pipeline_closing_price = _column_feature_pipeline("Close")

# Features derived from the "Volume" column.
pipeline_volume = _column_feature_pipeline("Volume")

# Concatenate the processed "Close" and "Volume" features side by side.
merged_features = FeatureUnion([
    ("ClosingPriceFeature", pipeline_closing_price),
    ("VolumeFeature", pipeline_volume),
])

# Add polynomial features (default settings) on top of the merged features,
# then fit a linear regression on the result.
pipeline_2 = Pipeline([
    ("MergedFeatures", merged_features),
    ("PolyFeature", PolynomialFeatures()),
    ("LinReg", LinearRegression()),
])
使用新模型进行数据训练与预测
# Train the combined pipeline on the training split, then predict on the test
# split. ``Pipeline.fit`` returns the fitted pipeline itself, so the two calls
# can be chained.
y_pred_2 = pipeline_2.fit(data_train, y_train).predict(data_test)
对预测数据进行打分
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions against the true test
# targets. The value is bound to a name and printed so it is also visible when
# the snippet runs as a script, where a bare expression would be discarded.
score_2 = r2_score(y_test, y_pred_2)
print("R^2 score:", score_2)