朴素贝叶斯分类器底层实现(Python)

# coding:utf-8
from collections import defaultdict
import numpy as np

class NativeByes(object):
    '''
    Naive Bayes text classifier over discretised bag-of-words features.

    Call train() first, then classify(). The misspelt public names
    (NativeByes, crate_worddic) are kept so existing callers keep working.
    '''

    def __init__(self):
        # Document frequency: word -> number of training documents containing it.
        self._dp_dict = None
        # Vocabulary: list of distinct words; a word's position is its feature index.
        self._word_dict = None
        # Class counts: label -> number of training documents with that label.
        self._pc_dict = None
        # Feature counts: 'c<label>-f<index>-v<value>' -> number of training
        # documents of that label whose feature <index> has value <value>.
        self._pwc_dict = None

    @staticmethod
    def _feature_key(label, index, value):
        '''Build the _pwc_dict key for one (label, feature index, value) triple.'''
        return 'c{}-f{}-v{}'.format(label, index, value)

    def create_dataset(self):
        '''
        Return the built-in toy corpus.
        :return: (documents, labels); documents are lists of words and
                 labels are 0/1 integers, one per document.
        '''
        data_set = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quite', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        labels = [0, 1, 0, 1, 0, 1]
        return data_set, labels

    def crate_worddic(self, data_set):
        '''
        Build the vocabulary and the per-word document frequency.
        :param data_set: iterable of documents, each an iterable of hashable words.
        :return: (vocabulary, dp_dict). vocabulary lists the distinct words in
                 first-seen order, so feature indices are reproducible between
                 runs; dp_dict maps each word to the number of documents that
                 contain it.
        '''
        vocabulary = []
        dp_dict = defaultdict(int)
        for data in data_set:
            # set() so a word repeated inside one document is counted once.
            for word in set(data):
                dp_dict[word] += 1
            for word in data:
                # dp_dict already holds every word of this document, so
                # track vocabulary membership separately via the list order.
                pass
        seen = set()
        for data in data_set:
            for word in data:
                if word not in seen:
                    seen.add(word)
                    vocabulary.append(word)
        return vocabulary, dp_dict

    def build_vec(self, words, word_dic):
        '''
        Turn a document into a vector aligned with the vocabulary
        (bag of words, each occurrence weighted by 1 / document frequency).
        :param words: iterable of words of one document.
        :param word_dic: vocabulary list that fixes the vector layout; None
                         falls back to the vocabulary stored by train().
        :return: numpy float array of length len(word_dic).
        '''
        if word_dic is None:
            word_dic = self._word_dict
        dp_dict = self._dp_dict
        # One pass to map word -> index instead of list.index() per word.
        # setdefault keeps the first position, matching list.index().
        index_of = {}
        for position, word in enumerate(word_dic):
            index_of.setdefault(word, position)
        vec = np.zeros(len(word_dic))
        for word in words:
            position = index_of.get(word)
            if position is None:
                # Out-of-vocabulary words carry no information.
                continue
            # .get() so a lookup never inserts into the trained defaultdict.
            dp = dp_dict.get(word, 0)
            if dp:
                vec[position] += (1.0 / dp)
        return vec

    def train(self, data_set, labels):
        '''
        Fit the classifier: store vocabulary, document frequencies, class
        counts and per-class feature-value counts on the instance.
        :param data_set: sequence of documents (lists of words).
        :param labels: sequence of class labels, labels[i] belongs to data_set[i].
        :return: None
        '''
        word_dict, dp_dict = self.crate_worddic(data_set)
        self._word_dict = word_dict
        self._dp_dict = dp_dict
        # p(c): label -> number of documents.
        pc_dict = defaultdict(int)
        # p(w|c): feature key -> number of documents.
        pwc_dict = defaultdict(int)
        for doc_index, words in enumerate(data_set):
            label = labels[doc_index]
            pc_dict[label] += 1
            vec = self.build_vec(words, word_dict)
            for feature_index, weight in enumerate(vec):
                # NOTE(review): int() truncates the tf/df weight, so every word
                # whose document frequency exceeds its in-document count maps
                # to value 0 - confirm this discretisation is intended.
                key = self._feature_key(label, feature_index, int(weight))
                pwc_dict[key] += 1

        self._pc_dict = pc_dict
        self._pwc_dict = pwc_dict

    def classify(self, words):
        '''
        Predict the label of one document.

        The score of a class is log p(c) + sum_i log p(f_i = v_i | c) with
        add-one smoothing; the label with the highest score wins (the first
        one encountered on ties).
        :param words: iterable of words of the document to classify.
        :return: the best-scoring training label, or '' when the model holds
                 no classes.
        '''
        pc_dict = self._pc_dict
        pwc_dict = self._pwc_dict
        vec = self.build_vec(words, self._word_dict)
        values = [int(weight) for weight in vec]
        total_docs = float(sum(pc_dict.values()))
        label = ''
        best_score = None
        for c, pc in pc_dict.items():
            # np.log instead of np.math.log: the np.math alias is gone from
            # recent NumPy releases.
            score = np.log(pc / total_docs)
            # Add-one smoothing; "+ 2" treats a feature as two-valued
            # (absent / present), which is the common case after truncation.
            denominator = pc + 2.0
            for feature_index, value in enumerate(values):
                key = self._feature_key(c, feature_index, value)
                # .get() keeps classification from inserting keys into the model.
                count = pwc_dict.get(key, 0)
                score += np.log((count + 1) / denominator)
            # Log scores are negative, so compare against None, not 0.
            if best_score is None or score > best_score:
                label = c
                best_score = score
        return label


def main():
    '''
    Train the classifier on its built-in toy corpus, then classify every
    training document and print whether the prediction matches its label.
    '''
    byes = NativeByes()
    data_set, labels = byes.create_dataset()
    byes.train(data_set, labels)
    for words, real_label in zip(data_set, labels):
        lb = byes.classify(words)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('{},cls is {},real is {}'.format(lb == real_label, lb, real_label))


if __name__ == '__main__':
    main()


最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容