自己造轮子系列今天造的是AdaBoost,基分类器用的是DS(decision stump)。之所以会写这个系列,一方面是我觉得可以锻炼coding能力,另一方面也有助于对算法的理解,毕竟懂得自己推导、理解含义,和真正动手实现,感觉是不一样的。
from numpy import *
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify samples with a decision stump on a single feature.

    Args:
        dataMatrix: 2-D samples-by-features data. May be a numpy matrix, an
            ndarray or a nested sequence (it is converted with ``asarray``).
        dimen: Column index of the feature to threshold.
        threshVal: Threshold compared against the feature values.
        threshIneq: ``'lt'`` labels samples whose value is <= threshVal
            as -1.0; any other value labels samples whose value is
            > threshVal as -1.0.

    Returns:
        An ndarray of shape (number of samples, 1) holding +1.0 or -1.0.
    """
    # Work on a plain 1-D column so that matrix, ndarray and nested-list
    # inputs all produce the same per-row boolean mask.
    column = asarray(dataMatrix)[:, dimen]
    if threshIneq == 'lt':
        negative = column <= threshVal
    else:
        negative = column > threshVal
    retArray = ones((column.shape[0], 1))
    retArray[negative] = -1.0
    return retArray
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Three nested loops search every feature, a grid of thresholds for that
    feature, and both inequality directions. Taking the weight vector as a
    parameter lets AdaBoost call this once per boosting round.

    Args:
        dataArr: Samples-by-features data (converted with ``asmatrix``).
        classLabels: One label per sample; compared for equality with the
            +1.0 / -1.0 stump predictions.
        D: Sample weights as an m-by-1 column, so that ``D.T * errArr``
            is a matrix product yielding the weighted error.

    Returns:
        A tuple ``(bestStump, minError, bestClasEst)``:
        ``bestStump`` is a dict with keys ``'dim'``, ``'thresh'`` and
        ``'ineq'``; ``minError`` is the 1x1 weighted error of that stump
        (or ``inf`` if no candidate beat the initial value);
        ``bestClasEst`` is the m-by-1 array of that stump's predictions.
    """
    # `asmatrix` is the function the `mat` alias pointed to; the alias itself
    # is no longer available in NumPy 2.x.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j runs from -1 to numSteps, so thresholds go from one step below
        # the feature minimum up to the feature maximum.
        for j in range(-1, int(numSteps) + 1):
            # The threshold depends only on j, so compute it once per step.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVal = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 marks a misclassified sample, 0 a correct one.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVal == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVal.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
def adaBoostTrainDS(dataArr, classLabels, numIt=50):
    """Train an AdaBoost ensemble of decision stumps.

    Runs up to ``numIt`` boosting rounds and stops early once the combined
    classifier has zero error on the training data.

    Args:
        dataArr: Samples-by-features training data.
        classLabels: One label per sample; compared against the sign of the
            weighted vote, so they are expected to be +1 / -1.
        numIt: Maximum number of weak classifiers to train (default 50).

    Returns:
        A list of stump dicts as produced by ``buildStump``, each extended
        with an ``'alpha'`` entry holding that stump's vote weight.
    """
    weakClassArr = []  # trained weak classifiers, in training order
    m = shape(dataArr)[0]
    # The label column is the same in every round, so build it once.
    # (`asmatrix` replaces the `mat` alias, which NumPy 2.x no longer has.)
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = asmatrix(zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back the error as a 1x1 matrix; pull out the
        # scalar explicitly instead of relying on implicit array-to-float
        # conversion.
        error = asarray(error).item()
        # The 1e-16 floor avoids division by zero for a perfect stump.
        # `maximum` is used rather than the builtin `max` because the
        # file's wildcard numpy import can shadow `max` with `numpy.max`
        # on NumPy releases that export it.
        alpha = float(0.5 * log((1.0 - error) / maximum(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Re-weight samples: exp(-alpha) where label and prediction agree,
        # exp(+alpha) where they differ, then renormalise to sum to 1.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if errorRate == 0.0:
            break
    return weakClassArr
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained ensemble: sign(sum(alpha * stump)).

    Args:
        datToClass: Samples-by-features data to classify (converted with
            ``asmatrix``).
        classifierArr: Sequence of stump dicts with the keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.

    Returns:
        An m-by-1 matrix with the sign of the alpha-weighted vote for each
        sample (all zeros when ``classifierArr`` is empty).
    """
    # `asmatrix` replaces the `mat` alias, which NumPy 2.x no longer has.
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = asmatrix(zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return sign(aggClassEst)