This article only gives the relevant implementation code and a basic parameter-search method. The sklearn package is used here.
# coding: utf-8
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
class RF:
    def __init__(self):
        # Candidate values for n_estimators; the search range is chosen based on the actual data
        self.n_estimators_options = [100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]
        self.best_n_estimators = 0
        self.best_acc = 0

    def train(self, mall_id, X, shop_ids, TEST, row_ids):
        # Encode the string labels as integers
        lbl = preprocessing.LabelEncoder()
        lbl.fit(shop_ids)
        y = lbl.transform(shop_ids)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        # Parameter search: evaluate each candidate n_estimators on the hold-out split
        for n_estimators_size in self.n_estimators_options:
            alg = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators_size)
            alg.fit(X_train, y_train)
            predict = alg.predict(X_test)
            acc = (y_test == predict).mean()
            # Keep track of the best parameter and its accuracy
            if acc >= self.best_acc:
                self.best_acc = acc
                self.best_n_estimators = n_estimators_size
            print('[n_estimators, acc]:', n_estimators_size, acc)
        # Retrain on the full training data with the best parameter
        rf = RandomForestClassifier(n_jobs=-1, n_estimators=self.best_n_estimators)
        rf.fit(X, y)
        # Predict labels for the test set
        predict = rf.predict(TEST)
        # Predict class probabilities (if needed)
        # predict_prob = rf.predict_proba(TEST)
        # Map the predicted integer labels back to the original shop_ids
        predict = lbl.inverse_transform(predict)
        return predict
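For completeness, below is a minimal usage sketch of the RF class above. The feature matrix X, the shop_ids labels, the mall_id, and the TEST matrix here are randomly generated placeholders (the real feature engineering is not shown), and train is assumed to return the decoded predictions as in the version shown above.

# A minimal usage sketch with randomly generated placeholder data
import numpy as np

if __name__ == '__main__':
    X = np.random.rand(500, 10)                                       # hypothetical training features
    shop_ids = np.random.choice(['shop_a', 'shop_b', 'shop_c'], 500)  # hypothetical shop labels
    TEST = np.random.rand(50, 10)                                     # hypothetical test features
    row_ids = list(range(50))                                         # hypothetical test row ids

    model = RF()
    predict = model.train('mall_1', X, shop_ids, TEST, row_ids)       # 'mall_1' is a made-up mall_id
    print('best n_estimators:', model.best_n_estimators, 'best acc:', model.best_acc)
    print('sample predictions:', predict[:5])

Note that this hand-written loop only evaluates each candidate on a single 80/20 split; it is simpler than cross-validation-based search (e.g. sklearn's GridSearchCV) but also less robust, and is kept here to match the original code.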