This article is an introduction to LightGBM and to basic parameter tuning. It covers:
1. Classification with the native LightGBM API
2. Classification with the sklearn-style LightGBM API
3. Regression with the native LightGBM API
4. Regression with the sklearn-style LightGBM API
5. A survey of the main parameters and a first look at how to tune them

Two notes:
1. LightGBM also ships with built-in cross-validation training and grid-search-style optimization; those are out of scope here.
2. The datasets are the iris dataset bundled with sklearn and an SP500 dataset.
I. Native LightGBM classification example
Overview
- Start with the simplest usage to learn the programming pattern, then explore more parameters to reach a better-performing configuration.
Steps
1. Prepare the data
import sklearn.datasets as ds
data, target = ds.load_iris(return_X_y=True)
data.shape,target.shape
((150, 4), (150,))
data = data[0:100, :]    # keep the first 100 samples: classes 0 and 1 only, so it is a binary problem
target = target[0:100]
data.shape,target.shape
((100, 4), (100,))
2. Split into training and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
X_train.shape, X_test.shape, y_train.shape,y_test.shape
((80, 4), (20, 4), (80,), (20,))
3. Basic LightGBM usage pattern
import lightgbm as lgb
lgb_train = lgb.Dataset(X_train, label=y_train)
# train
gbm = lgb.train({}, train_set=lgb_train)
# predict on the test set
y_pred = gbm.predict(X_test)
# inspect the raw predictions
print(y_pred)
[1.22846467e-05 1.22846467e-05 1.22846467e-05 9.99985723e-01
9.99985723e-01 9.99985723e-01 1.22846467e-05 1.22846467e-05
9.99985723e-01 9.99985723e-01 9.99985723e-01 9.99985723e-01
9.99985723e-01 9.99985723e-01 1.22846467e-05 9.99985723e-01
1.22846467e-05 9.99985723e-01 9.99985723e-01 9.99985723e-01]
4. Manual evaluation of the predictions
y_test
array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])
y_pred[y_pred > 0.7] = 1   # high scores -> class 1
y_pred[y_pred < 0.3] = 0   # low scores -> class 0
y_pred
array([0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
1., 1., 1.])
import numpy as np
y_pred = y_pred.astype(int)   # note: np.int was removed from NumPy; use the builtin int
y_pred
array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])
corr = (y_pred == y_test).sum()
print(f'Correct: {corr}')
Correct: 20
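Instead of thresholding by hand, sklearn's metrics can score the output directly. A minimal sketch, assuming y_pred still holds the raw probabilities returned by gbm.predict:
import numpy as np
from sklearn.metrics import accuracy_score

# round the probabilities at 0.5 to get hard class labels
y_hat = (y_pred > 0.5).astype(int)
print(accuracy_score(y_test, y_hat))   # fraction of correct predictions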
II. LightGBM classification with the sklearn API
Overview
- LightGBM ships an sklearn-style wrapper; users with sklearn experience will find it more direct and familiar.
Steps
1. Prepare the data
import sklearn.datasets as ds
from sklearn.model_selection import train_test_split
data, target = ds.load_iris(return_X_y=True)
data = data[0:100, :]    # classes 0 and 1 only
target = target[0:100]
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
2. Train
import lightgbm as lgb
classifier = lgb.LGBMClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
3. Manual evaluation
y_pred
array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1])
corr = (y_pred == y_test).sum()
print(f'Correct: {corr}')
Correct: 20
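The sklearn wrapper also exposes the usual estimator conveniences, so manual counting is optional. A small sketch using only the standard sklearn estimator API:
# accuracy straight from the estimator
print(classifier.score(X_test, y_test))
# class-membership probabilities instead of hard labels
proba = classifier.predict_proba(X_test)
print(proba[:3])   # first three rows, one column per class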
III. Native LightGBM regression example
Prepare the data
- The data is the S&P 500: the prices of the 500 constituent stocks together with the index value at the same moment, sampled every 5 minutes.
1. Load the data
import pandas as pd
data = pd.read_csv("data_stocks.csv")
2. Visualize the data
%matplotlib inline
import matplotlib.pyplot as plt
# -----seaborn alternative-----
# plot against an integer index instead of DATE so the points are evenly spaced
# import seaborn as sns
# data['IDX'] = data.index
# data[0: 5]
# sns.set(context='paper', style='ticks')
# ax = sns.lineplot(x="IDX", y="SP500", data=data)
# ax.figure.set_size_inches(15, 4)
# -----------------------------
figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.set_xlim(0, data.shape[0])
ax.set_ylim(2250, 2550)
ax.plot('SP500', data=data, color=(1, 0, 0, 1))
plt.show()
Figure: SP500 index curve
3. Clean the data
data.drop('DATE', axis=1, inplace=True)   # DATE is not a usable numeric feature
4. Split into training and test sets
- Because this is a time series, the split is sequential rather than random.
from sklearn.model_selection import train_test_split
# equivalent sequential split:
# train, test = train_test_split(data, test_size=0.2, shuffle=False)
# ((33012, 502), (8254, 502))
train, test = data[: data.shape[0] * 4 // 5], data[data.shape[0] * 4 // 5:]
5. Normalize the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(train)   # fit on the training set only, to avoid leaking test statistics
train = scaler.transform(train)
test = scaler.transform(test)
6. Features and labels
- The SP500 index is the label.
- The prices of the 500 listed companies are the features.
train_data = train[:, 1:]
train_label = train[:, 0]
test_data = test[:, 1:]
test_label = test[:, 0]
7. Visualize the normalized data
%matplotlib inline
import matplotlib.pyplot as plt
figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.set_xlim(0, data.shape[0])
ax.plot(range(len(train_label)), train_label, color=(1, 0, 0, 1))   # training set
ax.plot(range(len(train_label), len(train_label) + len(test_label)), test_label, color=(0, 0, 1, 1))   # test set
plt.show()
Figure: normalized data, training set (red) and test set (blue)
Training and testing
1. Train
import lightgbm as lgb
lgb_train = lgb.Dataset(train_data, label=train_label)
# train with default parameters
gbm = lgb.train({}, train_set=lgb_train)
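Training with an empty dict falls back on the defaults (objective regression, 100 iterations). A sketch of passing explicit parameters instead; the values are illustrative, not tuned:
params = {
    'objective': 'regression',   # L2 regression, the default
    'metric': 'l2',              # report mean squared error during training
    'learning_rate': 0.1,
    'num_leaves': 31,
}
gbm = lgb.train(params, train_set=lgb_train)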
2. Predict
predict_train = gbm.predict(train_data)
predict_test = gbm.predict(test_data)
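These predictions live in the scaled (-1, 1) space. To read them in index units, the SP500 column (column 0 of the scaled matrix) can be mapped back through the fitted scaler; a sketch using MinMaxScaler's fitted attributes:
# MinMaxScaler applies x_scaled = x * scale_ + min_ per column,
# so the inverse for the SP500 column (index 0) is:
sp500_pred = (predict_test - scaler.min_[0]) / scaler.scale_[0]
print(sp500_pred[:5])   # first few predictions in original index units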
3. Visualize
%matplotlib inline
import matplotlib.pyplot as plt
figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.set_xlim(0, data.shape[0])
# original data
ax.plot(range(len(train_label)), train_label, color=(1, 0, 0, 1))   # training set
ax.plot(range(len(train_label), len(train_label) + len(test_label)), test_label, color=(0, 0, 1, 1))   # test set
# predictions
ax.plot(range(len(train_label)), predict_train, color=(1, 1, 0, 1))   # training set
ax.plot(range(len(train_label), len(train_label) + len(test_label)), predict_test, color=(0, 1, 1, 1))   # test set
plt.show()
Figure: regression curves on the training and test sets
4. Training-set predictions
%matplotlib inline
import matplotlib.pyplot as plt
figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.plot(predict_train, color=(1, 0, 0, 1))
ax.plot(train_label, color=(0, 0, 1, 1))
plt.show()
Figure: training set, predictions vs. labels
5. Test-set predictions
%matplotlib inline
import matplotlib.pyplot as plt
figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.plot(predict_test, color=(1, 0, 0, 1))
ax.plot(test_label, color=(0, 0, 1, 1))
plt.show()
Figure: test-set fit
IV. LightGBM regression with the sklearn API
- The steps mirror the previous section; only the core training code differs:
import lightgbm as lgb
regressor = lgb.LGBMRegressor()
regressor.fit(train_data, train_label)
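Prediction then follows the standard sklearn convention; a short sketch reusing the train_data/test_data arrays built in the previous section:
predict_train = regressor.predict(train_data)
predict_test = regressor.predict(test_data)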
- The fitted curves look like this:
Figure: fit produced via the sklearn API
V. The main LightGBM parameters
A three-class iris example
import sklearn.datasets as ds
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
data, target = ds.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
print(X_test.shape)
lgb_train = lgb.Dataset(X_train, label=y_train)
# train; with empty params LightGBM defaults to the regression objective,
# so the three class labels are fit as continuous values
gbm = lgb.train({}, train_set=lgb_train)
# predict on the test set
y_pred = gbm.predict(X_test)
# inspect the output
print(y_pred)
print(y_test)
(30, 4)
[ 2.94937766e-03 2.04888134e+00 1.08704226e+00 2.04229695e+00
1.41606515e+00 2.04163640e+00 1.92434531e+00 1.29981967e+00
-6.66079203e-03 -1.56922140e-03 1.95794545e+00 -5.36818708e-03
2.94937766e-03 2.94937766e-03 7.28622873e-05 7.28622873e-05
1.02877711e+00 1.04163151e+00 1.98902325e+00 1.05445874e+00
1.19528594e+00 2.00206072e+00 1.08159577e+00 6.82782116e-04
2.94937766e-03 2.94937766e-03 1.76885071e+00 1.54081382e+00
1.02567221e+00 1.20616748e+00]
[0 2 1 2 2 2 2 1 0 0 2 0 0 0 0 0 1 1 2 1 1 2 1 0 0 0 1 2 1 1]
- The material below is drawn from the official documentation.
LightGBM features
This is covered under [Features] on the official site.
LightGBM's distinguishing features are its optimizations, mainly in the following areas:
- Speed and memory usage [Optimization in Speed and Memory Usage]
- Sparse data [Sparse Optimization]
- Accuracy [Optimization in Accuracy]
- Network communication [Optimization in Network Communication]
- Parallel learning [Optimization in Parallel Learning]
- GPU support [GPU Support]
- A range of applications and metrics [Applications and Metrics]
- Other features [Other Features]
- Limit max_depth of tree while growing trees leaf-wise
- DART
- L1/L2 regularization
- Bagging
- Column (feature) sub-sample
- Continued train with input GBDT model
- Continued train with the input score file
- Weighted training
- Validation metric output during training
- Multiple validation data
- Multiple metrics
- Early stopping (both training and prediction)
- Prediction for leaf index
The benefit of these features shows up concretely in how the parameters below are used.
LightGBM benchmark experiments
- LightGBM provides five benchmark experiments; the datasets can be downloaded via links on the official site.
Data | Task | Link | #Train | #Features | Comment
---|---|---|---|---|---
Higgs | Binary classification | link | 10,500,000 | 28 | last 500,000 samples were used as test set
Yahoo LTR | Learning to rank | link | 473,134 | 700 | set1.train as train, set1.test as test
MS LTR | Learning to rank | link | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set
Expo | Binary classification | link | 11,000,000 | 700 | last 1,000,000 samples were used as test set
Allstate | Binary classification | link | 13,184,290 | 4228 | last 1,000,000 samples were used as test set
- The comparison results show that LightGBM's performance gains are substantial; see the official site for the exact numbers.
LightGBM parameters
LightGBM's parameters fall into several groups by role and function.
There are too many to cover exhaustively: consult the official documentation when needed, and tune to find what fits each dataset. At a minimum:
- know how the common parameters are used;
- know the standard tuning routine.
1. Parameter format
- Parameters are passed as a dict:
- { key1: value1, key2: value2, ... }
2. Common parameters: core parameters
The task parameter
- default: train
- type: enum
- values:
  - train/training: train a model
  - predict/prediction/test: predict
  - convert_model: convert the saved model format
  - refit/refit_tree: refit an existing model on new data
- aliases: task_type
The objective parameter
- default: regression
- type: enum
- aliases: objective_type/app/application
- values:
  - regression objectives:
    - regression
    - regression_l1
    - huber
    - fair
    - poisson
    - quantile
    - mape
    - gamma
    - tweedie
  - binary classification (labels are 0 or 1):
    - binary
  - multi-class classification (the num_class parameter must also be set):
    - multiclass
    - multiclassova
  - cross-entropy (labels are probabilities between 0 and 1):
    - cross_entropy
    - cross_entropy_lambda
  - ranking:
    - lambdarank
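For example, a multiclass setup needs both the objective and the class count; a minimal sketch:
params = {
    'objective': 'multiclass',   # softmax over num_class classes
    'num_class': 3,              # required for multiclass/multiclassova
}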
The boosting parameter
- default: gbdt
- type: enum
- values:
  - gbdt
  - rf
  - dart
  - goss
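Switching the boosting scheme is a single key. For instance, a sketch using DART (boosting with tree dropout); drop_rate is a DART-specific knob and the value here is illustrative:
params = {
    'boosting': 'dart',      # gradient boosting with dropped trees
    'objective': 'binary',
    'drop_rate': 0.1,        # fraction of trees dropped each iteration
}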
The num_iterations parameter
- the number of boosting iterations
The learning_rate parameter
- the learning (shrinkage) rate
The num_leaves parameter
- the maximum number of leaves per tree
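These three interact: more leaves means more complex trees, and a smaller learning rate usually needs more iterations. A sketch of a common starting point; the numbers are illustrative, not tuned:
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'num_iterations': 200,    # more rounds to compensate for...
    'learning_rate': 0.05,    # ...a smaller shrinkage step
    'num_leaves': 31,         # the default tree complexity
}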
3. Learning control parameters
There are many; below are the ones related to overfitting.
Overfitting controls:
- max_depth: the maximum tree depth, mainly used to limit overfitting; a negative value means no limit
- min_data_in_leaf: the minimum number of samples in a leaf
- min_sum_hessian_in_leaf: the minimum sum of Hessians in a leaf
- feature_fraction: column sampling, the fraction of features used per tree, between 0 and 1; speeds up training and reduces overfitting
- bagging_fraction: row sampling without resampling; bagging_freq must also be set to control how often bagging is performed (every N iterations)
- early_stopping_round: stop training when a validation metric has not improved over the last given number of rounds
- regularization:
  - lambda_l1
  - lambda_l2
- min_gain_to_split: the minimum gain required to perform a split
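A sketch of how these knobs combine in practice; the specific values are illustrative starting points, not recommendations:
params = {
    'objective': 'binary',
    'max_depth': 6,              # cap depth even though trees grow leaf-wise
    'min_data_in_leaf': 20,      # refuse very small leaves
    'feature_fraction': 0.8,     # sample 80% of the features per tree
    'bagging_fraction': 0.8,     # sample 80% of the rows...
    'bagging_freq': 5,           # ...every 5 iterations
    'lambda_l1': 0.1,            # L1 regularization
    'lambda_l2': 0.1,            # L2 regularization
    'min_gain_to_split': 0.0,    # minimum gain needed to split
}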
4. Metric parameters
- Metric parameters measure model quality through an error computation, so each metric corresponds to an objective among the core parameters. There are too many to list one by one; consult the official documentation as needed.
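Metrics become visible once a validation set is passed to training. A sketch, assuming lgb_train and the X_test/y_test split built above:
lgb_valid = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': ['multi_logloss', 'multi_error'],   # several metrics at once
}
gbm = lgb.train(params, train_set=lgb_train, valid_sets=[lgb_valid])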
5. Example with explicit parameters
import sklearn.datasets as ds
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
data, target = ds.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
print(X_test.shape)
lgb_train = lgb.Dataset(X_train, label=y_train)
# training parameters
params = {
    'task': 'train',
    'objective': 'multiclass',
    'num_class': 3,
}
gbm = lgb.train(params, train_set=lgb_train)   # returns the trained booster
# predict on the test set; one probability column per class
y_pred = gbm.predict(X_test)
# evaluate: take the most probable class per row
print(y_pred.argmax(axis=1))
print(y_test)
e_result = y_pred.argmax(axis=1) == y_test
print(f"Correct: {e_result.sum()}")
(30, 4)
[1 1 1 2 2 0 2 1 2 0 2 2 0 0 2 1 0 0 1 2 2 1 2 2 0 0 0 0 1 2]
[1 2 1 2 2 0 2 1 1 0 2 2 0 0 1 1 0 0 1 2 2 1 2 2 0 0 0 0 1 2]
Correct: 27   # result before tuning
VI. A parameter tuning example
1. Accuracy
import sklearn.datasets as ds
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
import pandas as pd
data, target = ds.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
lgb_train = lgb.Dataset(X_train,label = y_train)
params = {
    'task': 'train',
    'objective': 'multiclass',
    'num_class': 3,
}
min_merror = float('inf')   # lowest cross-validated error seen so far
best_params = {}            # holds the best parameters found

for num_leaves in range(20, 50):
    for max_depth in range(3, 8):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=2018,
            nfold=3,
            metrics=['multi_error'],
            early_stopping_rounds=30,   # LightGBM >= 4.0: pass callbacks=[lgb.early_stopping(30)] instead
            verbose_eval=False)         # LightGBM >= 4.0: lgb.log_evaluation(0)
        mean_merror = pd.Series(cv_results['multi_error-mean']).min()
        boost_rounds = pd.Series(cv_results['multi_error-mean']).idxmin()   # best round count (not used below)
        if mean_merror < min_merror:
            min_merror = mean_merror
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth

params['num_leaves'] = best_params['num_leaves']
params['max_depth'] = best_params['max_depth']
print(params['num_leaves'], params['max_depth'])
20 3   # the best number of leaves (20) and depth (3) found
- Results with the best parameters
gbm = lgb.train(params, train_set=lgb_train)   # returns the trained booster
# predict on the test set
y_pred = gbm.predict(X_test)
# evaluate
print(y_pred.argmax(axis=1))
print(y_test)
e_result = y_pred.argmax(axis=1) == y_test
print(f"Correct: {e_result.sum()}")
[0 1 2 1 0 2 1 1 0 0 0 2 0 0 2 1 1 1 0 0 1 2 2 2 1 2 2 0 2 0]
[0 1 2 1 0 2 1 1 0 0 0 1 0 0 2 1 1 1 0 0 1 2 2 2 1 2 2 0 2 0]
Correct: 29   # result after tuning
- Comparing the errors before and after tuning shows the improvement.
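The same comparison can be made numerically; a closing sketch, assuming y_pred comes from the tuned model and the 30-sample test split above:
# multi-class error rate = fraction of misclassified samples
err = (y_pred.argmax(axis=1) != y_test).mean()
print(f"error rate: {err:.3f}")   # 1/30 ~ 0.033 here, vs. 3/30 = 0.100 before tuning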