# coding:utf-8
from collections import defaultdict
import numpy as np
class NativeByes(object):
    '''
    Naive Bayes text classifier over document-frequency-weighted word counts.

    Typical use: call ``train(data_set, labels)`` once, then
    ``classify(words)`` for each tokenized document.

    The class and method names keep their original spelling so that
    existing callers continue to work.
    '''

    def __init__(self):
        # Document frequency: word -> number of training documents that
        # contain the word.
        self._dp_dict = None
        # Vocabulary: list of distinct words; a word's position in the list
        # is its feature index.
        self._word_dict = None
        # Class counts: label -> number of training documents with that label.
        self._pc_dict = None
        # Feature-value counts per class, keyed 'c<label>-f<index>-v<value>'.
        self._pwc_dict = None

    def create_dataset(self):
        '''
        Return the small built-in toy corpus.

        :return: tuple ``(data_set, labels)``; ``data_set`` is a list of six
            tokenized documents and ``labels`` holds one 0/1 label per
            document
        '''
        data_set = [['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
                    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                    ['quite', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        labels = [0, 1, 0, 1, 0, 1]
        return data_set, labels

    def crate_worddic(self, data_set):
        '''
        Build the vocabulary and the document frequencies of a corpus.

        :param data_set: iterable of tokenized documents (each an iterable
            of hashable words)
        :return: tuple ``(vocabulary, dp_dict)``; ``vocabulary`` lists every
            distinct word in first-seen order, ``dp_dict`` maps each word to
            the number of documents containing it
        '''
        vocabulary = []
        dp_dict = defaultdict(int)
        for document in data_set:
            seen_in_document = set()
            for word in document:
                # A word counts once per document, however often it repeats.
                if word in seen_in_document:
                    continue
                seen_in_document.add(word)
                # First-seen order keeps feature indices reproducible between
                # runs (plain set iteration order is not).
                if word not in dp_dict:
                    vocabulary.append(word)
                dp_dict[word] += 1
        return vocabulary, dp_dict

    def build_vec(self, words, word_dic=None):
        '''
        Convert a tokenized document into a weighted bag-of-words vector.

        The entry for a vocabulary word is its number of occurrences in
        ``words`` divided by the word's document frequency recorded by
        ``train``. Words that are not in the vocabulary, or that have no
        recorded document frequency, contribute nothing.

        :param words: iterable of tokens
        :param word_dic: vocabulary to index against; defaults to the
            vocabulary learned by ``train``
        :return: numpy float array with one entry per vocabulary word
        :raises RuntimeError: if no vocabulary or document frequencies are
            available because ``train`` has not been called
        '''
        if word_dic is None:
            word_dic = self._word_dict
        dp_dict = self._dp_dict
        if word_dic is None or dp_dict is None:
            raise RuntimeError(
                'build_vec() requires a trained model; call train() first')

        vocabulary = list(word_dic)
        # word -> index of its first occurrence, replacing a linear
        # list.index() scan per token with a constant-time lookup.
        index_of = {}
        for position, word in enumerate(vocabulary):
            index_of.setdefault(word, position)

        occurrences = defaultdict(int)
        for word in words:
            position = index_of.get(word)
            if position is not None:
                occurrences[position] += 1

        vec = np.zeros(len(vocabulary))
        for position, count in occurrences.items():
            dp = dp_dict.get(vocabulary[position], 0)
            if dp:
                # Divide once instead of summing 1/dp repeatedly: repeated
                # float addition can undershoot (ten times 0.1 is 0.999...),
                # which would later truncate to the wrong integer value.
                vec[position] = float(count) / dp
        return vec

    @staticmethod
    def _feature_values(vec):
        '''
        Discretize a weighted vector into integer feature values.

        NOTE(review): truncation maps every weight below 1 to 0, so a word
        whose document frequency exceeds its occurrence count in a document
        is indistinguishable from an absent word. Kept as in the original
        design; confirm this is intended.
        '''
        return [int(weight) for weight in vec]

    def train(self, data_set, labels):
        '''
        Fit the classifier on a labelled corpus.

        :param data_set: sequence of tokenized documents
        :param labels: sequence with one class label per document
        :return: None; the learned statistics are stored on the instance
        :raises ValueError: if ``data_set`` and ``labels`` differ in length
        '''
        if len(data_set) != len(labels):
            raise ValueError(
                'data_set and labels must have the same length '
                '({} != {})'.format(len(data_set), len(labels)))

        word_dict, dp_dict = self.crate_worddic(data_set)
        self._word_dict = word_dict
        self._dp_dict = dp_dict

        # label -> number of documents
        pc_dict = defaultdict(int)
        # 'c<label>-f<index>-v<value>' -> number of documents of that label
        # whose feature <index> has integer value <value>
        pwc_dict = defaultdict(int)
        for document, label in zip(data_set, labels):
            pc_dict[label] += 1
            features = self._feature_values(self.build_vec(document, word_dict))
            for index, value in enumerate(features):
                pwc_dict['c{}-f{}-v{}'.format(label, index, value)] += 1

        self._pc_dict = pc_dict
        self._pwc_dict = pwc_dict

    def classify(self, words):
        '''
        Predict the class label of a tokenized document.

        Each class is scored in log space as
        ``log P(c) + sum_i log P(feature_i = value_i | c)``. The conditional
        probabilities use add-one smoothing, ``(count + 1) / (N_c + 2)``, so
        a feature value never seen with a class does not zero out the score
        and classes of different sizes are compared on equal terms.

        :param words: iterable of tokens
        :return: the label with the highest score (the first such label on
            ties), or ``''`` if the model was trained on an empty corpus
        :raises RuntimeError: if ``train`` has not been called
        '''
        word_dict = self._word_dict
        pc_dict = self._pc_dict
        pwc_dict = self._pwc_dict
        if word_dict is None or pc_dict is None or pwc_dict is None:
            raise RuntimeError(
                'classify() requires a trained model; call train() first')

        features = self._feature_values(self.build_vec(words, word_dict))
        total_docs = float(sum(pc_dict.values()))

        label = ''
        best_score = None
        for c, class_count in pc_dict.items():
            denominator = class_count + 2.0
            score = np.log(class_count / total_docs)
            for index, value in enumerate(features):
                key = 'c{}-f{}-v{}'.format(c, index, value)
                # .get() rather than [] so that classifying never inserts
                # new keys into the trained counts.
                score += np.log((pwc_dict.get(key, 0) + 1) / denominator)
            if best_score is None or score > best_score:
                label = c
                best_score = score
        return label
if __name__ == '__main__':
    # Demo: train on the bundled toy corpus, then classify every training
    # document and report whether the prediction matches its own label.
    byes = NativeByes()
    data_set, labels = byes.create_dataset()
    byes.train(data_set, labels)

    # Ad-hoc sample classified as a smoke test; its label is not reported.
    vec = ['dog', 'stupid', 'dog']
    lb = byes.classify(vec)

    for vec, real_label in zip(data_set, labels):
        lb = byes.classify(vec)
        # print() with a single argument gives the same output on Python 2
        # and Python 3; the former print statement was Python 2 only.
        print('{},cls is {},real is {}'.format(lb == real_label, lb, real_label))
朴素贝叶斯分类器底层实现(python)
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- #简书对代码不友好!!! # Example of Naive Bayes implemented from Sc...
- opencv的机器学习检测在ML库中具有很大的相似性。简单来说都可以分成两步:1、训练/得到分类器。2、使用分类器...
- pip3 install -i https://pypi.douban.com/simple pandas (可以快速...
- 班训营已经来到倒数第2次课了,但为什么我的lesson 4的作业迟迟没交,因为感觉自己跟不上大部队的步伐。 首先自...