ID3算法
通过信息增益的大小来决定优先选择哪个特征进行划分。
计算下图的信息熵,确定下一个分类的特征
image.png
# 导包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Entropy of the target column ("is the account real") before any split.
# H(D) = sum( -p(x) * log2(p(x)) ) over the two outcomes:
#   yes: 7 samples -> p = 0.7
#   no:  3 samples -> p = 0.3
p_yes, p_no = 0.7, 0.3
info_D = (-p_yes)*np.log2(p_yes) + (-p_no)*np.log2(p_no)
info_D
# Expected entropy of the target after splitting on log density L.
# Each bucket's entropy is weighted by the share of samples in that bucket:
#   s: weight 0.3, 1 yes / 2 no
#   m: weight 0.4, 3 yes / 1 no
#   l: weight 0.3, all yes
entropy_s = (-1/3)*np.log2(1/3) + (-2/3)*np.log2(2/3)
entropy_m = (-3/4)*np.log2(3/4) + (-1/4)*np.log2(1/4)
# The l bucket is pure: (-3/3)*np.log2(3/3) is zero, so its
# 0.3 * (...) term is left out of the sum below.
info_D_L = 0.3 * entropy_s + 0.4 * entropy_m
info_D_L
# Information gain of log density L: entropy of the target column minus
# the expected entropy after splitting on L (info_D - info_D_L).
gain_L = info_D - info_D_L
gain_L