随机森林

随机森林(RandomForest), 可用于分类或者回归，相比较决策树的算法，随机森林是由多棵CART（Classification And Regression Tree）构成的。优点很明显：

决策树过拟合问题严重， RandomForest不存在，所以也避免了剪枝的问题；抗噪性好(两个随机采样)
速度快，模型简单，精度高
能高效应对特征缺失的情况，标量特征和连续特征通吃，无需归一化
GINI系数评估变量属性重要性，并给出连续变量的分界值
在不平衡的数据集中, 它含有平衡误差的方法。
可生成一个Proximities=（pij）矩阵，用于度量样本之间的相似性： pij=aij/N, aij表示样本i和j出现在随机森林中同一个叶子结点的次数，N随机森林中树的颗数
计算样本实例之间的Proximities,可以用来聚类分析、异常分析、或者数据的其他有趣的视图。

缺点：

递归过深，吃内存，容易堆栈溢出

思想：
两个随机采样：

样本采样：假设原始样本数为N,用bootstrap方法从N样本中获取构造每棵决策树的训练集。有放回采样
特征采样：无放回抽样

随机森林: 分类用投票vote, 回归用回归树的平均值

森林中单颗树的分类强度（Strength）：每颗树的分类强度越大，则随机森林的分类性能越好。
森林中树之间的相关度（Correlation）：树之间的相关度越大，则随机森林的分类性能越差。
随机选取训练样本集：使用Bagging方法形成每颗树的训练集

随机选取分裂属性集：假设共有p个属性，指定一个属性数p≤m，在每个内部结点，从M个属性中随机抽取F个属性作分裂属性集，以这p个属性上最好的分裂方式对结点进行分裂（在整个森林的生长过程中， F的值一般维持不变）

细节：
用作分类时，m默认取 sqrt(p)，最小取1；
用作回归时，m默认取 p / 3，最小取5

代码实现：

# !/usr/bin/pytho
# encoding=utf-8
from __future__ import division
import numpy as np
import math
import copy

"""
http://blog.csdn.net/ljblog2014/article/details/38875137
随机森林: 分类用投票vote, 回归用回归树的平均值
森林中单颗树的分类强度（Strength）：每颗树的分类强度越大，则随机森林的分类性能越好。
森林中树之间的相关度（Correlation）：树之间的相关度越大，则随机森林的分类性能越差。
随机选取训练样本集：使用Bagging方法形成每颗树的训练集

随机选取分裂属性集：假设共有p个属性，指定一个属性数p≤m，在每个内部结点，从M个属性中随机抽取F个属性作分裂属性集，以这p个属性上最好的分裂方式对结点进行分裂（在整个森林的生长过程中， F的值一般维持不变）

细节：
用作分类时，m默认取 sqrt(p)，最小取1；
用作回归时，m默认取 p / 3，最小取5
m是一个可调的参数
RandomForestClassifier(
            n_estimators=10,   //树的棵树
            criterion='gini',      //分类标准
            max_depth=None,  //最大深度
             min_samples_split=2,  //最少分裂几个子节点
             min_weight_fraction_leaf=0.0,
            max_leaf_nodes=None,
             bootstrap=True,
            n_jobs=1,    //指定并行使用的进程数
            random_state=None,
            verbose=0,
             warm_start=False,
             class_weight=None  //类别权重，样本不均衡时很重要
)

这个代码每棵树采用由放回的随机采样, feature用的全部的feature
"""


class node(object):
    def __init__(self, col=-1, value=None, results=None, trueBranch=None, falseBranch=None):
        self.col = col
        self.value = value
        self.results = results
        self.trueBranch = trueBranch
        self.falseBranch = falseBranch

    def getLabel(self):
        """
        如果不限制树的深度的话, 叶子节点就是单一类别
        如果限制树的深度的话, 叶子节点取多数样本的类别
        :return:
        """
        label = None
        if self.results == None:
            return None
        else:
            max_counts = 0
            for key in self.results.keys():
                if self.results[key] > max_counts:
                    label = key
                    max_counts = self.results[key]
        return label


class RandomForestsClassifier:
    def __init__(self, n_bootstrapSamples=20):
        self.n_bootstrapSamples = n_bootstrapSamples  # 决策树的数量
        self.list_tree = []

    def divideSet(self, samples, column, value):
        splitFunction = None
        if isinstance(value,int) or isinstance(value,float):
            splitFunction = lambda row: row[column] >= value
        else:
            splitFunction = lambda row: row[column] == value
        set1 = [row for row in samples if splitFunction(row)]
        set2 = [row for row in samples if not splitFunction(row)]
        return (set1,set2)

    def uniqueCounts(self, samples):
        # 统计每个类别下的样本数
        results = {}
        for row in samples:
            r = row[len(row)-1]
            if r not in results:
                results[r] = 0
            results[r] = results[r]+1
        return results

    def giniEstimate(self, samples):
        if len(samples) == 0: return 0
        total = len(samples)
        counts = self.uniqueCounts(samples)
        gini = 0
        for target in counts:
            gini = gini + pow(counts[target],2)
        gini = 1 - gini / pow(total,2)
        return gini

    def buildTree(self, samples): # 构造CART决策树
        if len(samples) == 0:
            return node()
        currentGini = self.giniEstimate(samples)
        bestGain = 0
        bestCriteria = None
        bestSets = None
        colCount = len(samples[0]) - 1
        colRange = range(0,colCount)
        np.random.shuffle(colRange)
        for col in colRange[0:int(math.ceil(math.sqrt(colCount)))]:
            colValues = {}
            for row in samples:
                colValues[row[col]] = 1
            for value in colValues.keys():
                (set1,set2) = self.divideSet(samples, col, value)
                gain = currentGini - (len(set1)*self.giniEstimate(set1) + len(set2)*self.giniEstimate(set2)) / len(samples)
                if gain > bestGain and len(set1) > 0 and len(set2) > 0:  # 最合适的gain系数
                    bestGain = gain
                    bestCriteria = (col, value)
                    bestSets = (set1, set2)
        if bestGain > 0:
            trueBranch = self.buildTree(bestSets[0])
            falseBranch = self.buildTree(bestSets[1])
            return node(col=bestCriteria[0], value=bestCriteria[1],
                        trueBranch=trueBranch, falseBranch=falseBranch)
        else:
            return node(results=self.uniqueCounts(samples))

    def printTree(self, tree,indent='  '):  # 以文本形式显示决策树
        if tree.results != None:
            print str(tree.results)
        else:
            print str(tree.col)+':'+str(tree.value)+'?'
            print indent+'T->',
            self.printTree(tree.trueBranch,indent+'  ')
            print indent+'F->',
            self.printTree(tree.falseBranch,indent+'  ')

    def predict_tree(self, observation, tree):  # 利用决策树进行分类
        if tree.results != None:
            return tree.getLabel()
        else:
            v = observation[tree.col]
            branch = None
            if isinstance(v,int) or isinstance(v,float):
                if v >= tree.value: branch = tree.trueBranch
                else: branch = tree.falseBranch
            else:
                if v == tree.value: branch = tree.trueBranch
                else: branch = tree.falseBranch
            return self.predict_tree(observation,branch)

    def generateBootstrapSamples(self, data): # 构造bootstrap样本
        samples = []
        # 增加特征的随机选取过程: 特征随机无放回
        features = len(data[0][:-1])
        randRange = range(features)
        np.random.shuffle(range(features))
        # 特征取sqart数量
        sample_features = randRange[:max(1, int(math.ceil(math.sqrt(features))))]

        # 这个是有放回的样本采样
        for i in range(len(data)):
            # np.random.randint产生0~len(data)的随机数, 有放回的随机采样, 即samples存在重复数据
            temp = data[np.random.randint(len(data))]
            tp = []
            for j in range(len(temp[:-1])):
                if j in sample_features:
                    tp.append(temp[j])
            tp.append(temp[-1])
            samples.append(data[np.random.randint(len(data))])
        return samples

    def fit(self, data): # 构造随机森林
        for i in range(self.n_bootstrapSamples):
            # 构造self.n_bootstrapSamples 棵CART树
            samples = self.generateBootstrapSamples(data)
            currentTree = self.buildTree(samples)
            self.list_tree.append(currentTree)

    def predict_randomForests(self, observation): # 利用随机森林对给定观测数据进行分类
        results = {}
        for i in range(len(self.list_tree)):
            # 单决策树预测
            currentResult = self.predict_tree(observation, self.list_tree[i])
            # 投票表决
            if currentResult not in results:
                results[currentResult] = 0
            results[currentResult] = results[currentResult] + 1
        max_counts = 0
        finalResult = None
        # 投票
        for key in results.keys():
            if results[key] > max_counts:
                finalResult = key
                max_counts = results[key]
        return finalResult


from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
#pp = lambda x: x > 2 and 1 or -1
#X = np.array([[t[0], t[1], t[2], pp(t[-1])] for t in X.tolist()])
#print X
y = iris.target
temp_data = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
# 由于上述代码要求输入的观测数据存储在二维列表中，需将numpy二维数组转换成列表

data = []
for i in range(temp_data.shape[0]):
    temp = []
    for j in range(temp_data.shape[1]):
        temp.append(temp_data[i][j])
    data.append(temp)

rowRange = range(150)
np.random.shuffle(rowRange)    # shuffle随机排序

#从鸢尾花数据集(容量为150)按照随机均匀抽样的原则选取70%的数据作为训练数据
training_data = [data[i] for i in rowRange[0:105]]
#按照随机均匀抽样的原则选取30%的数据作为检验数据
testing_data = [data[i] for i in rowRange[105:150]]
classifier = RandomForestsClassifier(n_bootstrapSamples=20) # 初始化随机森林
classifier.fit(training_data)#利用训练数据进行拟合
finalResults = []
for row in testing_data:
    finalResult = classifier.predict_randomForests(row[:-1])#对检验数据集进行分类
    finalResults.append(finalResult)
errorVector = np.zeros((len(testing_data),1))   # np.zeros 错误率
errorVector[np.array(finalResults) != (np.array(testing_data))[:,4]] =1
print errorVector.sum() / len(testing_data)#计算错判率


import sklearn.ensemble as ens

iris = load_iris()
clf = ens.RandomForestClassifier(n_estimators=20) #初始化

# RandomForestClassifier 不允许 字符串变量
clf = clf.fit([t[:-1] for t in training_data], [t[-1] for t in training_data]) #训练决策树
print clf.feature_importances_
print clf.n_classes_
print clf.estimators_
print clf.predict_log_proba([3.2, 2.5, 2.1, 2.0])
print clf.score((np.array(testing_data))[:,0:-1], (np.array(testing_data))[:,-1])

finalResults = []
for row in testing_data:
    finalResult = clf.predict(row[:-1])[0]#对检验数据集进行分类
    finalResults.append(finalResult)
errorVector = np.zeros((len(testing_data),1))   # np.zeros 错误率
errorVector[np.array(finalResults) != (np.array(testing_data))[:,4]] =1
print errorVector.sum() / len(testing_data)#计算错判率

sklearn可视化：

参考：
http://blog.csdn.net/a819825294/article/details/51177435
http://blog.csdn.net/ljblog2014/article/details/38875137

推荐阅读更多精彩内容