# 13 ML tree regression

from numpy import *

def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Every non-blank line is stripped, split on tab characters, and each
    field is converted with ``float``. The rest of this file treats the
    last column of each row as the target value.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list containing one list of floats per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Build a real list: on Python 3 map() returns a lazy iterator,
            # which cannot be turned into a numeric matrix afterwards.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of *dataSet* by thresholding one column.

    Args:
        dataSet: 2-D NumPy matrix (or 2-D array) of samples, one per row.
        feature: Index of the column to compare.
        value: Threshold the column is compared against.

    Returns:
        A tuple ``(mat0, mat1)``: ``mat0`` holds the rows whose *feature*
        column is greater than *value*, ``mat1`` the rows where it is less
        than or equal to *value*. Either part may contain zero rows.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] is the array of row indices where the test is true.
    # Index with the whole array so that every matching row is kept; taking
    # element [0] of the result would keep only the first matching row and
    # raise IndexError when nothing matches.
    mat0 = dataSet[nonzero(column > value)[0], :]
    mat1 = dataSet[nonzero(column <= value)[0], :]
    return mat0, mat1

# Leaf model for regression trees: a leaf predicts a single constant.
def regLeaf(dataSet):
    """Return the mean of the last column (the target values) of *dataSet*."""
    targets = dataSet[:, -1]
    return mean(targets)

# Error measure for regression trees: total squared error of the targets.
def regErr(dataSet):
    """Return the variance of the last column multiplied by the row count.

    Variance times the number of samples equals the sum of squared
    deviations of the target values from their mean.
    """
    sampleCount = shape(dataSet)[0]
    targetVariance = var(dataSet[:, -1])
    return targetVariance * sampleCount

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the binary split of *dataSet* with the lowest total error.

    Every distinct value of every feature column (all columns except the
    last, which holds the target) is tried as a threshold for
    ``binSplitDataSet``; the split whose two parts have the smallest summed
    ``errType`` wins.

    Args:
        dataSet: 2-D NumPy matrix (or 2-D array); last column is the target.
        leafType: Callable that turns a data set into a leaf value.
        errType: Callable that returns the error of a data set.
        ops: Pair ``(tolS, tolN)``. ``tolS`` is the minimum error reduction
            a split must achieve; ``tolN`` is the minimum number of samples
            allowed on each side of a split.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leafType(dataSet))`` when a stop condition is met.
    """
    tolS = ops[0]  # minimum error reduction required to accept a split
    tolN = ops[1]  # minimum number of samples on each side of a split

    # Stop condition 1: all target values are identical, nothing to split.
    # asarray(...).ravel() flattens both a matrix column (m x 1) and an
    # array column (m,) into plain Python floats.
    targets = asarray(dataSet[:, -1]).ravel().tolist()
    if len(set(targets)) == 1:
        return None, leafType(dataSet)

    n = shape(dataSet)[1]
    # The best feature is the one giving the largest reduction in error
    # compared with the unsplit data set.
    S = errType(dataSet)
    bestS = inf
    bestIndex = 0
    bestValue = 0

    for featIndex in range(n - 1):
        # Convert the column to hashable floats first: iterating a matrix
        # column directly yields 1x1 matrices, which cannot go into a set.
        candidates = set(asarray(dataSet[:, featIndex]).ravel().tolist())
        for splitVal in candidates:
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)

            # Ignore splits that leave too few samples on either side.
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue

            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS

    # Stop condition 2: the error reduction is below the threshold. This
    # also covers "no admissible split found", where bestS is still inf.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)

    # Stop condition 3: the chosen split leaves too few samples on a side.
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)

    # Feature index to split on and the threshold value for that split.
    return bestIndex, bestValue

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a binary tree from *dataSet*.

    Args:
        dataSet: 2-D NumPy matrix of samples; assumed to be a NumPy matrix
            so that it can be filtered by row indices.
        leafType: Callable that turns a data set into a leaf value.
        errType: Callable that returns the error of a data set.
        ops: Options tuple forwarded unchanged to ``chooseBestSplit``.

    Returns:
        A leaf value when ``chooseBestSplit`` reports a stop condition,
        otherwise a dict with the keys ``'spInd'`` (split feature index),
        ``'spVal'`` (split threshold), ``'left'`` and ``'right'`` (subtrees
        built from the two parts returned by ``binSplitDataSet``).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)

    # A feature of None signals a stop condition; val is then the leaf value.
    if feat is None:
        return val

    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    return {
        'spInd': feat,
        'spVal': val,
        'left': createTree(lSet, leafType, errType, ops),
        'right': createTree(rSet, leafType, errType, ops),
    }

# Demo run: load the sample data, build a regression tree, and print a
# preview of the inputs together with the resulting tree.
myData = loadDataSet('ex00.txt')
# asmatrix is the function the old `mat` alias pointed to; the alias itself
# was removed in NumPy 2.0, so call asmatrix directly.
myMat  = asmatrix(myData)
myTree = createTree(myMat)
print('myData', myData[0:2])
print('myMat',  myMat[0:2])
print('myTree', myTree)
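# Last edited on
# (c) Copyright belongs to the author; for reprints or content cooperation, please contact the author.
# [Community content notice] Some community content may have been generated with AI assistance; please use common sense and multiple sources to judge it carefully when browsing.
# Platform statement: the article content (including any images or videos) was uploaded and published by the author and represents only the author's own views; Jianshu is an information publishing platform and only provides information storage services.

# Recommended reading: more great content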