import numpy as np
import xgboost as xgb
def construct_model_data(X_train, X_validation, X_test,
                         y_train, y_validation, y_test):
    # Wrap the pandas splits in DMatrix objects; NaN values are treated as missing.
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values,
                         feature_names=list(X_train.columns), missing=np.nan)
    dvalidation = xgb.DMatrix(X_validation.values, label=y_validation.values,
                              feature_names=list(X_train.columns), missing=np.nan)
    dtest = xgb.DMatrix(X_test.values, label=y_test.values,
                        feature_names=list(X_test.columns), missing=np.nan)
    return dtrain, dtest, dvalidation
def train_model(parameters, num_round, dtrain, dtest, dvalidation, es=10000):
    # Early stopping monitors the last entry in the watchlist, i.e. the validation set.
    watchlist = [(dtrain, 'train'), (dtest, 'test'), (dvalidation, 'validation')]
    bst = xgb.train(parameters, dtrain, num_round, evals=watchlist,
                    early_stopping_rounds=es)
    return bst
parameters = {'eta': 0.38, 'objective': 'binary:logistic', 'eval_metric': 'auc',
              'max_depth': 5, 'min_child_weight': 240, 'scale_pos_weight': 11,
              'colsample_bylevel': 0.7, 'subsample': 0.78, 'colsample_bytree': 0.7,
              'seed': 2019, 'tree_method': 'approx', 'gamma': 15, 'lambda': 8600}
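# A minimal usage sketch, assuming X_train/X_validation/X_test and the matching
# label Series already exist as pandas objects; num_round and es are illustrative.
dtrain, dtest, dvalidation = construct_model_data(X_train, X_validation, X_test,
                                                  y_train, y_validation, y_test)
bst = train_model(parameters, num_round=500, dtrain=dtrain, dtest=dtest,
                  dvalidation=dvalidation, es=50)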
def create_feature_map(features):  # write a feature map for inspecting important features
    # fmap format: <index>\t<feature name>\t<type>, where 'q' marks a quantitative feature
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
create_feature_map(X_train.columns)
import operator
importance = bst.get_score(fmap='xgb.fmap', importance_type='gain')
importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
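# Illustrative: importance is now a list of (feature, gain) pairs sorted in
# descending order, so the highest-gain features can be printed directly.
for name, gain in importance[:10]:
    print('{0}: {1:.2f}'.format(name, gain))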
import xgboost
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report
xgb_classifier = xgboost.XGBClassifier(gamma=0.2, reg_alpha=0.5, reg_lambda=2,
                                       scale_pos_weight=1, random_state=0)
param_grid = {'n_estimators': [70, 80, 90], 'min_child_weight': [2, 3, 4, 5],
              'max_depth': [6, 8, 10]}
# Cross-validation splits: each fold is sampled with stratification by class
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
grid_search = GridSearchCV(xgb_classifier, param_grid, n_jobs=8, cv=skf, verbose=10)  # grid search + cross-validation
grid_search.fit(X_trainval, y_trainval)
best_parameters = grid_search.best_params_
print('Best parameter combination found by GridSearchCV:', best_parameters, '\n')
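# Illustrative: GridSearchCV also exposes the mean cross-validated score of the
# winning combination, which is worth logging alongside the parameters.
print('Best cross-validation score:', grid_search.best_score_)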
# Feed the best parameters back into the model, retrain it, and report its performance on the train+validation and test sets
xgb_config = xgboost.XGBClassifier(**best_parameters, gamma=0.2, reg_alpha=0.5,
                                   reg_lambda=2, scale_pos_weight=1, random_state=0)
xgbf = xgb_config.fit(X_trainval, y_trainval)
y_tvp = xgbf.predict(X_trainval)
# classification_report expects (y_true, y_pred), so ground truth goes first
print('Classification report of the best estimator on trainval:\n',
      classification_report(y_trainval, y_tvp))
y_tp = xgbf.predict(X_test)
print('Classification report of the best estimator on test:\n',
      classification_report(y_test, y_tp))
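# Optional sketch: the booster above was tuned with AUC as eval_metric, so it can
# help to report test AUC here as well; assumes binary labels and sklearn's
# roc_auc_score.
from sklearn.metrics import roc_auc_score
y_tp_proba = xgbf.predict_proba(X_test)[:, 1]
print('Test AUC:', roc_auc_score(y_test, y_tp_proba))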