构造决策树根节点
from collections import Counter
from math import log
def createDataSet():
    """Build the toy training set used by the decision-tree demo.

    Returns:
        tuple: (rows, column_names) where each row holds three integer
        feature values followed by the class label ('yes'/'no'), and
        column_names gives the feature names plus the class column name.
    """
    rows = [
        [1, 1, 1, 'no'],
        [1, 1, 1, 'yes'],
        [1, 1, 1, 'yes'],
        [1, 2, 2, 'yes'],
        [2, 2, 2, 'no'],
        [2, 2, 2, 'yes'],
        [2, 2, 2, 'yes'],
        [2, 3, 2, 'yes'],
        [2, 3, 1, 'no'],
        [2, 3, 1, 'no'],
        [3, 3, 1, 'yes'],
        [3, 2, 2, 'yes'],
        [3, 2, 2, 'yes'],
        [3, 2, 1, 'yes'],
    ]
    column_names = ['temperate', 'weather', 'windy', 'go_out']
    return rows, column_names
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of a data set's class labels.

    Each element of *dataSet* is a row whose LAST item is the class label.

    Args:
        dataSet: list of rows; row[-1] is the class label.

    Returns:
        float: H = -sum(p * log2(p)) over the label distribution;
        0.0 for an empty data set.
    """
    total = len(dataSet)
    # Counter replaces the hand-rolled "if key not in dict" bookkeeping.
    label_counts = Counter(row[-1] for row in dataSet)
    entropy = 0.0
    for count in label_counts.values():
        prob = count / total  # probability of this label
        entropy -= prob * log(prob, 2)
    return entropy
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index with the largest information gain (ID3).

    Args:
        dataSet: list of rows; the last column is the class label.

    Returns:
        int: index of the best feature, or -1 when no feature yields a
        positive information gain.
    """
    feature_count = len(dataSet[0]) - 1
    # Entropy of the whole node, before any split.
    base_entropy = calcShannonEnt(dataSet)
    best_gain, best_index = 0.0, -1
    for idx in range(feature_count):
        # Distinct values taken by feature `idx` across all rows.
        distinct_values = {row[idx] for row in dataSet}
        conditional_entropy = 0.0
        for val in distinct_values:
            subset = splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            conditional_entropy += weight * calcShannonEnt(subset)
        gain = base_entropy - conditional_entropy
        print("第%d个特征的增益为%.3f" % (idx, gain))
        if gain > best_gain:
            best_gain, best_index = gain, idx
    return best_index
def splitDataSet(dataSet, axis, value):
    """Select the rows where column *axis* equals *value*, dropping that column.

    Args:
        dataSet: list of rows.
        axis: column index to filter on.
        value: value the column must equal.

    Returns:
        list: new rows (fresh lists) with column *axis* removed.
    """
    return [row[:axis] + row[axis + 1:]
            for row in dataSet if row[axis] == value]
if __name__ == '__main__':
    # Demo: build the sample data, show its entropy, then report which
    # feature gives the largest information gain.
    data, feature_names = createDataSet()
    print(data)
    print(calcShannonEnt(data))
    print("最优索引值为:" + str(chooseBestFeatureToSplit(data)))
结果:运行上述代码可打印各特征的信息增益,并得到最优特征的索引。然后,据此划分数据集,再对划分后的子数据集重复以上过程,直到满足以下两个条件之一:当前数据集的每个样本同属一类,或者所有特征均已使用。
import operator
def major_class(class_list: list):
    """Return the most frequent class label in *class_list*.

    Used as the leaf label when no features remain to split on.

    Args:
        class_list: non-empty list of class labels.

    Returns:
        The label with the highest count. On a tie, the first-seen label
        wins — the same tie-break as a stable descending sort of counts.
    """
    # Counter.most_common(1) replaces the hand-rolled count dict + sort.
    return Counter(class_list).most_common(1)[0][0]
以上函数是求子集中样本数量最多的类别。
构建决策树:
def create_tree(dataSet: list, labels: list):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataSet: list of rows; the last column is the class label.
        labels: feature names aligned with the feature columns.

    Returns:
        Either a class label (leaf) or a dict of the form
        {feature_name: {feature_value: subtree, ...}}.
    """
    classes = [row[-1] for row in dataSet]
    # No features left (only the class column remains): majority vote.
    if len(dataSet[0]) == 1:
        return major_class(classes)
    # Pure node: every sample shares one class.
    if classes.count(classes[0]) == len(classes):
        return classes[0]
    best_idx = chooseBestFeatureToSplit(dataSet)
    best_name = labels[best_idx]
    # Feature names that remain available to the subtrees.
    remaining = labels[:best_idx] + labels[best_idx + 1:]
    node = {best_name: {}}
    for val in {row[best_idx] for row in dataSet}:
        subset = splitDataSet(dataSet, best_idx, val)
        node[best_name][val] = create_tree(subset, remaining[:])
    return node
# Example driver: assumes the code above is saved as a module named `tree`.
import tree
# NOTE(review): `tree.create_dataset` does not match the `createDataSet`
# defined above — presumably the same function under another name; verify.
dataSet, labels = tree.create_dataset()
decision_tree = create_tree(dataSet, labels)
先用sklearn生成一个.dot的文件,再使用graphviz包可实现决策树的可视化。