svmMLiA.py implements the SMO algorithm without the heuristic: alphas[i] and alphas[j] are chosen at random.
svmQuicken.py implements the SMO algorithm with the heuristic for choosing alphas[i] and alphas[j].
The code is somewhat messy; before the results came out I had no energy to tidy it up, and once they did, even less.
(All accuracy figures below were obtained by learning the hyperplane from the training data and then testing on that same training data; no separate test set was prepared. A simple held-out split is sketched at the end of this post.)
1. Linearly separable: linear kernel
Figure 1 below comes from svmMLiA.py: linearly separable data, with alphas[i] and alphas[j] chosen at random. It is very slow, but the result is good. The large red circles mark the support vectors; the accuracy is 0.92.
Figure 2 below comes from svmQuicken.py: the same linearly separable data, with alphas[i] and alphas[j] chosen by the heuristic. It is much faster and the result is decent, though not as good as Figure 1. The large red circles mark the support vectors; the accuracy is 0.90.
2. Linearly inseparable: Gaussian (RBF) kernel
The figures below all come from svmMLiA.py, with alphas[i] and alphas[j] chosen at random. Figures 3, 4 and 5 show the results for the Gaussian-kernel parameter sigma = 0.1, 0.3 and 0.6 respectively; the large red circles mark the support vectors. Clearly, the smaller sigma is, the more support vectors we get and the more accurate the result here. But if there are too many support vectors, every prediction effectively uses the whole training set, and the method degenerates into k-nearest neighbors. The accuracies for sigma = 0.1, 0.3 and 0.6 are 0.915254, 0.830508 and 0.813559.
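For reference, these are the standard formulas behind the behavior described above, consistent with the calcKernelValue and prediction code below: the Gaussian kernel and the SVM decision function are

K(x, z) = \exp\!\left(-\frac{\lVert x - z \rVert^2}{2\sigma^2}\right), \qquad f(x) = \sum_{i=1}^{m} \alpha_i y_i K(x_i, x) + b

As \sigma \to 0, K(x_i, x) vanishes unless x is very close to x_i, so almost every training point must become a support vector, which is exactly the k-nearest-neighbor-like behavior mentioned above.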
svmMLiA.py
'''
Created on 2017-07-09
@author: fujianfei
'''
import os
import numpy as np
# Load the data set
def loadDataSet (fileName):
data_path = os.getcwd()+'\\data\\'
labelMat = []
svmData = np.loadtxt(data_path+fileName,delimiter=',')
dataMat=svmData[:,0:2]
    # zero-mean centering (disabled)
# meanVal=np.mean(dataMat,axis=0)
# dataMat=dataMat-meanVal
label=svmData[:,2]
for i in range (np.size(label)):
if label[i] == 0 or label[i] == -1:
labelMat.append(float(-1))
if label[i] == 1:
labelMat.append(float(1))
return dataMat.tolist(),labelMat
# Simplified SMO: the heuristic is not used; j is picked at random
def selectJrand(i,m):
j=i
while (j==i):
        j = int(np.random.uniform(0,m)) # pick a random index in [0, m)
return j
# Clip aj into [L, H] (adjust values above H or below L)
def clipAlpha(aj,H,L):
if aj > H:
aj = H
if L > aj:
aj = L
return aj
'''
Kernel function.
kernelOption = ('linear', 0)   -> linear kernel
kernelOption = ('rbf', sigma)  -> Gaussian (RBF) kernel
'''
def calcKernelValue(matrix_x, sample_x, kernelOption):
kernelType = kernelOption[0]
numSamples = matrix_x.shape[0]
kernelValue = np.mat(np.zeros((numSamples, 1)))
if kernelType == 'linear':
kernelValue = matrix_x.dot(sample_x.T)
elif kernelType == 'rbf':
sigma = kernelOption[1]
if sigma == 0:
sigma = 1.0
for i in range(numSamples):
diff = matrix_x[i, :] - sample_x
kernelValue[i] = np.exp(diff.dot(diff.T) / (-2.0 * sigma**2))
else:
        raise NameError("Unsupported kernel type! Use 'linear' or 'rbf'.")
return kernelValue
'''
Simplified SMO algorithm.
dataMatIn:    input data set
classLabels:  class labels
C:            penalty constant on the slack variables
toler:        tolerance
maxIter:      maximum number of iterations
kernelOption: kernel choice, see calcKernelValue
'''
def smoSimple(dataMatIn,classLabels,C,toler,maxIter,kernelOption):
dataMatrix = np.mat(dataMatIn);labelMat = np.mat(classLabels).transpose()
b=0;m,n = np.shape(dataMatrix)
alphas = np.mat(np.zeros((m,1)))
iter = 0
while(iter < maxIter):
        alphaPairsChanged = 0 # tracks whether any alpha pair was changed in this pass
        for i in range(m): # outer loop: go through every sample, i.e. every alpha
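            # Current prediction for sample i: f(x_i) = sum_j alpha_j*y_j*K(x_j, x_i) + b;
            # its error is Ei = f(x_i) - y_i.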
fXi = float(np.multiply(alphas,labelMat).T.dot(calcKernelValue(dataMatrix,dataMatrix[i,:],kernelOption))) + b
Ei = fXi - float(labelMat[i])
if (((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > 0))):
j = selectJrand(i, m)
fXj = float(np.multiply(alphas,labelMat).T.dot(calcKernelValue(dataMatrix,dataMatrix[j,:],kernelOption))) + b
Ej = fXj - float(labelMat[j])
alphaIold = alphas[i].copy();
alphaJold = alphas[j].copy();
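                # Box bounds for the new alphas[j]: the update must keep
                # y_i*alphas[i] + y_j*alphas[j] constant with both alphas in [0, C],
                # which restricts alphas[j] to [L, H].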
if (labelMat[i] != labelMat[j]):
L = max(0,alphas[j] - alphas[i])
H = min(C,C+alphas[j] - alphas[i])
else:
L = max(0,alphas[j] + alphas[i] -C)
H = min(C,alphas[j] + alphas[i])
if(L == H):print('L==H');continue
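                # eta = 2*K(i,j) - K(i,i) - K(j,j) <= 0 for a valid kernel; the
                # unclipped optimum is alphas[j] - y_j*(Ei - Ej)/eta.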
eta = 2.0 * calcKernelValue(dataMatrix[i,:],dataMatrix[j,:],kernelOption) - calcKernelValue(dataMatrix[i,:],dataMatrix[i,:],kernelOption) - calcKernelValue(dataMatrix[j,:],dataMatrix[j,:],kernelOption)
                if(eta >= 0): print('eta >= 0, alpha[j]=%f' % alphas[j]); continue
alphas[j] -= labelMat[j] * (Ei - Ej)/eta
alphas[j] = clipAlpha(alphas[j], H, L)
if (abs(alphas[j]-alphaJold) < 0.0001) : print('j not moving enough');continue
alphas[i] += labelMat[i]*labelMat[j]*(alphaJold - alphas[j])
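                # Recompute the threshold: b1 would make the updated E_i zero, b2 the updated
                # E_j zero; if neither alpha is strictly inside (0, C), average the two.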
b1 = b - Ei - labelMat[i]*(alphas[i] - alphaIold)*calcKernelValue(dataMatrix[i,:],dataMatrix[i,:],kernelOption) - labelMat[j]*(alphas[j]-alphaJold)*calcKernelValue(dataMatrix[j,:],dataMatrix[i,:],kernelOption)
b2 = b - Ej - labelMat[i]*(alphas[i] - alphaIold)*calcKernelValue(dataMatrix[i,:],dataMatrix[j,:],kernelOption) - labelMat[j]*(alphas[j]-alphaJold)*calcKernelValue(dataMatrix[j,:],dataMatrix[j,:],kernelOption)
if (0 < alphas[i] and (C > alphas[i])):b=b1
elif (0 < alphas[j] and (C > alphas[j])):b=b2
else:b=(b1+b2)/2.0
alphaPairsChanged += 1
print("iter:%d i:%d,pairs changed %d" % (iter,i,alphaPairsChanged))
if(alphaPairsChanged == 0) :
iter += 1
else :
iter = 0
print("iteration number:%d" % iter)
return b,alphas
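For the linear kernel, the separating hyperplane can be recovered directly from the returned b and alphas. A minimal usage sketch (loadDataSet and smoSimple are the functions above; the helper predictLinear is hypothetical, added here only for illustration):

import numpy as np
from SVM import svmMLiA

dataMatIn, classLabels = svmMLiA.loadDataSet('data2.txt')
b, alphas = svmMLiA.smoSimple(dataMatIn, classLabels, 0.6, 0.001, 40, ('linear', 0))

# w = sum_i alphas[i]*y_i*x_i; only support vectors (alphas[i] > 0) contribute
X = np.mat(dataMatIn)
y = np.mat(classLabels).T
w = np.multiply(alphas, y).T.dot(X)  # shape (1, n)

def predictLinear(x):
    # hypothetical helper: sign of the decision function w.x + b
    return np.sign(float(w.dot(np.mat(x).T)) + float(b))

print(predictLinear(dataMatIn[0]))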
svmQuicken.py
import numpy as np
import matplotlib.pyplot as plt
'''
Created on 2017-07-11
@author: fujianfei
'''
class optStruct:
    '''
    Common data structure holding the variables we need.
    '''
def __init__(self, dataMatIn, classLabels, C, toler, kernelOption):
        '''
        X:            training data set
        labelMat:     class labels for X
        C:            penalty constant on the slack variables
        tol:          tolerance
        m:            number of samples
        alphas:       Lagrange multipliers, the quantities being optimized
        b:            threshold
        eCache:       column 0 flags whether Ek is valid (1 valid, 0 invalid); column 1 holds the error Ek
        K:            kernel matrix
        kernelOption: ('linear', 0) for the linear kernel, ('rbf', sigma) for the Gaussian kernel with parameter sigma
        '''
self.X = dataMatIn
self.labelMat = classLabels
self.C = C
self.tol = toler
self.kernelOpt = kernelOption
self.m,self.n = np.shape(dataMatIn)
self.alphas = np.mat(np.zeros((self.m,1)))
self.b = 0
self.eCache = np.mat(np.zeros((self.m,2)))
self.K = np.mat(np.zeros((self.m,self.m)))
        # Precompute and cache the whole kernel matrix so it is never recomputed later
for i in range(self.m):
self.K[:,i] = calcKernelValue(self.X, self.X[i,:], kernelOption)
def calcEK(oS,k):
    '''
    Compute the error Ek.
    '''
fXk = float(np.multiply(oS.alphas,oS.labelMat).T.dot(oS.K[:,k])) + oS.b
Ek = fXk - float(oS.labelMat[k])
return Ek
def selectJ(i,oS,Ei):
    '''
    Heuristic choice of j: pick the j that gives the largest step, i.e. the largest |Ei - Ej|.
    '''
    # 1. maxDeltaE tracks the largest |Ei - Ek| seen so far, maxK the index achieving it, Ej the error to return
maxK = -1; maxDeltaE = 0; Ej = 0
    # 2. Save Ei into the eCache
oS.eCache[i] = [1,Ei]
    # 3. validEcacheList holds the indices whose cached Ek is valid (flag == 1)
validEcacheList = np.nonzero(oS.eCache[:,0].A)[0]
    # 4. If there is more than one valid entry, scan for the largest |Ei - Ek|
if (len(validEcacheList) > 1):
for k in validEcacheList:
Ek = calcEK(oS, k)
deltaE = abs(Ei - Ek)
if (maxDeltaE < deltaE):
maxDeltaE = deltaE
maxK = k
Ej = Ek
return maxK,Ej
    # 5. Otherwise pick j at random
else:
print("---------------随机选择的j---------------------")
j = selectJrand(i,oS.m)
Ej = calcEK(oS, j)
return j,Ej
def updateEk(oS,k):
    '''
    Compute Ek and store it in the eCache.
    '''
Ek = calcEK(oS, k)
oS.eCache[k] = [1,Ek]
def calcfXk(oS,k):
    '''
    Compute the prediction fXk; after training, it can be used to classify test data.
    '''
fXk = float(np.multiply(oS.alphas,oS.labelMat).T.dot(oS.K[:,k])) + oS.b
return fXk
def calcKernelValue(matrix_x, sample_x, kernelOption):
    '''
    Kernel function.
    kernelOption = ('linear', 0)   -> linear kernel
    kernelOption = ('rbf', sigma)  -> Gaussian (RBF) kernel
    '''
kernelType = kernelOption[0]
numSamples = matrix_x.shape[0]
kernelValue = np.mat(np.zeros((numSamples, 1)))
if kernelType == 'linear':
kernelValue = matrix_x.dot(sample_x.T)
elif kernelType == 'rbf':
sigma = kernelOption[1]
if sigma == 0:
sigma = 1.0
for i in range(numSamples):
diff = matrix_x[i, :] - sample_x
kernelValue[i] = np.exp(diff.dot(diff.T) / (-2.0 * sigma**2))
else:
        raise NameError("Unsupported kernel type! Use 'linear' or 'rbf'.")
return kernelValue
def selectJrand(i,m):
    '''
    Pick a random j different from i.
    '''
j=i
while (j==i):
        j = int(np.random.uniform(0,m)) # pick a random index in [0, m)
return j
def clipAlpha(aj,H,L):
    '''
    Clip aj into [L, H] (adjust values above H or below L).
    '''
if aj > H:
aj = H
if L > aj:
aj = L
return aj
def innerL(i,oS):
    '''
    Inner loop. Given i, choose j with the heuristic, optimize the pair alphas[i], alphas[j],
    compute the updated Ei, Ej and b, and store them all back into the optStruct.
    '''
    Ei = calcEK(oS, i)
    # Optimize only if alphas[i] violates the KKT conditions within tolerance (heuristic choice of i).
    # A forum question: the KKT conditions distinguish three cases, alphas[i]=0, alphas[i]=C and
    # 0<alphas[i]<C, yet the test below seems to cover only 0<alphas[i]<C. Are the boundary cases missing?
    # In fact alphas[i]=0 and alphas[i]=C are already covered: when alphas[i]=0, the clause alphas[i] < C
    # holds, so the branch fires only when labelMat[i]*Ei < -tol, and together these are exactly the KKT
    # violation for alphas[i]=0. The same reasoning applies to alphas[i]=C.
if (((oS.labelMat[i]*Ei < -oS.tol) and (oS.alphas[i] < oS.C)) or ((oS.labelMat[i]*Ei > oS.tol) and (oS.alphas[i] > 0))):
        # Choose j with the heuristic and compute the corresponding Ej
j,Ej = selectJ(i, oS, Ei)
print("启发式算法选出的 i = %d,j= %d" % (i,j))
        # Make copies of the pre-update alphas[i] and alphas[j]; the old value of alphas[j]
        # is needed later to check whether it changed enough.
alphaIold = oS.alphas[i].copy();alphaJold = oS.alphas[j].copy();
        # Compute the bounds L, H for alphas[j]
if (oS.labelMat[i] != oS.labelMat[j]):
L = max(0,oS.alphas[j] - oS.alphas[i])
H = min(oS.C,oS.C+oS.alphas[j] - oS.alphas[i])
else:
L = max(0,oS.alphas[j] + oS.alphas[i] - oS.C)
H = min(oS.C,oS.alphas[j] + oS.alphas[i])
        # If the lower bound L equals the upper bound H there is nothing to optimize; return 0
if(L == H):print('L=%d == H=%d' % (L,H));return 0
        # Compute eta
eta = 2.0 * oS.K[i,j] - oS.K[i,i] - oS.K[j,j]
        # If eta >= 0 the optimum lies on the boundary of [L, H]; no interior step is possible, return 0.
        # Explanation: -eta is the second derivative of the Lagrangian objective along the constraint
        # line. If eta >= 0 that objective is monotone (or flat) over the interval, so the optimum is
        # attained at L or H. One could store that boundary solution; here we simply skip the pair.
        # In practice eta >= 0 rarely happens: for the linear kernel,
        # eta = 2*x_i.x_j - x_i.x_i - x_j.x_j = -||x_i - x_j||^2 <= 0, and the degenerate eta = 0
        # case (x_i equal to x_j) is too rare to worry about.
        if(eta >= 0):
            print('eta >= 0')
            # no optimization possible for this pair; return 0
            return 0
        # Update alphas[j]
oS.alphas[j] -= oS.labelMat[j] * (Ei - Ej)/eta
oS.alphas[j] = clipAlpha(oS.alphas[j], H, L)
        # Update the cached Ej
updateEk(oS, j)
        # If alphas[j] barely changed, skip the rest and return 0
if (abs(oS.alphas[j]-alphaJold) < 0.00001) :
print('j not moving enough')
            # j did not move enough; no further optimization needed, return 0
return 0
        # Update alphas[i] by the same amount in the opposite direction (scaled by labelMat[i]*labelMat[j])
oS.alphas[i] += oS.labelMat[i]*oS.labelMat[j]*(alphaJold - oS.alphas[j])
        # Update the cached Ei
updateEk(oS, i)
        # Compute the threshold b
b1 = oS.b - Ei - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,i] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,i]
b2 = oS.b - Ej - oS.labelMat[i]*(oS.alphas[i] - alphaIold)*oS.K[i,j] - oS.labelMat[j]*(oS.alphas[j]-alphaJold)*oS.K[j,j]
if (0 < oS.alphas[i] and (oS.C > oS.alphas[i])):oS.b=b1
elif (0 < oS.alphas[j] and (oS.C > oS.alphas[j])):oS.b=b2
else:oS.b=(b1+b2)/2.0
return 1
else:print("alphas[i]在容错范围内,不需优化");return 0
def isFitKKT(oS):
    '''
    Check whether the KKT conditions hold within tolerance oS.tol; return True if they do.
    Used as the final stopping test: if the KKT conditions hold, the alphas found are optimal.
    '''
for i in range(oS.m):
        # Any alpha below 0 or above C violates KKT; return False
if (oS.alphas[i] < 0 or oS.alphas[i] > oS.C) : return False
        # If the core KKT condition fails for sample i, return False
if ((oS.alphas[i] == 0 and calcfXk(oS,i) * oS.labelMat[i] < 1) or (oS.alphas[i] > 0 and oS.alphas[i] < oS.C and abs(calcfXk(oS,i) * oS.labelMat[i] - 1) > oS.tol) or (oS.alphas[i] == oS.C and calcfXk(oS,i) * oS.labelMat[i] > 1)) : return False
    # Both checks passed; finally require sum_i alphas[i]*labelMat[i] == 0 (within tolerance)
    return float(abs(oS.alphas.T.dot(oS.labelMat))) < oS.tol
def smoP(dataMatIn,classLabels,C,toler,maxIter,kernelOption):
    '''
    The full SMO algorithm with the selection heuristics.
    dataMatIn:    training data set
    classLabels:  class labels
    C:            penalty constant on the slack variables
    toler:        tolerance
    maxIter:      maximum number of iterations
    kernelOption: ('linear', 0) for the linear kernel, ('rbf', sigma) for the Gaussian kernel with parameter sigma
    '''
    oS = optStruct(np.mat(dataMatIn),np.mat(classLabels).T,C,toler, kernelOption) # initialize oS
iter = 0
entireSet = True;alphaPairsChanged = 0
while (iter < maxIter) and ((alphaPairsChanged > 0) or (entireSet)) :
alphaPairsChanged = 0
        if entireSet : # full pass over every sample
for i in range(oS.m) :
alphaPairsChanged += innerL(i, oS)
print("fullSet, iter : %d i : %d, paris changes %d" % (iter,i,alphaPairsChanged))
print(isFitKKT(oS))
iter += 1
        else : # pass over only the non-bound samples (the support-vector candidates)
nonBoundIs = np.nonzero((oS.alphas.A >0) * (oS.alphas.A < C))[0]
for i in nonBoundIs :
alphaPairsChanged += innerL(i, oS)
print("non-bound, iter : %d i : %d, paris changes %d" % (iter,i,alphaPairsChanged))
print(isFitKKT(oS))
iter += 1
if entireSet : entireSet = False
elif (alphaPairsChanged == 0) : entireSet = True
print("iteration number : %d" % iter)
print("-#-#-#-#-#-#-#-#-#_#-#-#_#-#_##_#_#_#_#_#_#_#__#_#_#_#_#_")
print(isFitKKT(oS))
return oS
def showResult(oS):
    '''
    Plot the result.
    '''
w=np.multiply(oS.alphas,oS.labelMat).T.dot(oS.X)
w=np.mat(w)
x1=oS.X[:,0]
y1=oS.X[:,1]
x2=range(20,100)
b=float(oS.b)
w0=float(w[0,0])
w1=float(w[0,1])
# y2 = [-b/w1-w0/w1*elem for elem in x2]
# plt.plot(x2, y2)
for i in range(oS.m):
if ((oS.alphas[i] < oS.C) and (oS.alphas[i] > 0)):
            print('support vector alpha:')
print(oS.alphas[i])
plt.scatter(x1[i], y1[i],s=60,c='red',marker='o',alpha=0.5,label='SV')
if int(oS.labelMat[i]) == -1:
plt.scatter(x1[i], y1[i],s=30,c='red',marker='.',alpha=0.5,label='-1')
elif int(oS.labelMat[i]) == 1:
plt.scatter(x1[i], y1[i],s=30,c='blue',marker='x',alpha=0.5,label='+1')
plt.show()
def testSVM(svm, test_x, test_y):
    '''
    Evaluate the classification accuracy of the trained model.
    '''
test_x = np.mat(test_x)
test_y = np.mat(test_y).T
numTestSamples = test_x.shape[0]
supportVectorsIndex = np.nonzero(svm.alphas.A > 0)[0]
supportVectors = svm.X[supportVectorsIndex]
supportVectorLabels = svm.labelMat[supportVectorsIndex]
supportVectorAlphas = svm.alphas[supportVectorsIndex]
matchCount = 0
for i in range(numTestSamples):
kernelValue = calcKernelValue(supportVectors, test_x[i, :], svm.kernelOpt)
predict = kernelValue.T.dot(np.multiply(supportVectorLabels, supportVectorAlphas)) + svm.b
if np.sign(predict) == np.sign(test_y[i]):
matchCount += 1
accuracy = float(matchCount) / numTestSamples
return accuracy
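To summarize, the pair update that innerL implements is the standard SMO update, written out here for reference (E_i, L, H and K are the quantities defined in the code):

\eta = 2K_{ij} - K_{ii} - K_{jj}, \qquad \alpha_j \leftarrow \mathrm{clip}\!\left(\alpha_j - \frac{y_j (E_i - E_j)}{\eta},\; L,\; H\right), \qquad \alpha_i \leftarrow \alpha_i + y_i y_j (\alpha_j^{\mathrm{old}} - \alpha_j^{\mathrm{new}})

and the threshold is set from b_1, b_2 exactly as in the code: b_1 if 0 < \alpha_i < C, else b_2 if 0 < \alpha_j < C, else their average.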
init.py
from SVM import svmMLiA,svmQuicken
import numpy as np
if __name__ == '__main__':
dataMatIn,classLabels = svmMLiA.loadDataSet('data2.txt')
C=0.6
toler=0.001
maxIter = 40
    # Heuristic version: fast, but the result is not as good
    oS = svmQuicken.smoP(dataMatIn, classLabels, C, toler, maxIter, ('linear', 0))
    # Non-heuristic version: slow, but the result is better
    # b,alphas = svmMLiA.smoSimple(dataMatIn, classLabels, C, toler, maxIter, ('rbf', 0.05))
    # oS = svmQuicken.optStruct(np.mat(dataMatIn),np.mat(classLabels).T,C,toler, ('rbf', 0.05)) # initialize oS
    # oS.alphas = alphas
    # oS.b = b
print(oS.alphas)
    # compute the accuracy
rightRate = svmQuicken.testSVM(oS,dataMatIn, classLabels)
    # plot the result
svmQuicken.showResult(oS)
print("正确率是 : %f" % rightRate)