使用朴素贝叶斯检测垃圾邮件

对 email 进行二分类:正常邮件(ham)和垃圾邮件(spam)分别存放在 ham 和 spam 两个目录下。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 使用三种方式
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

# The two source directories, one per class (ham and spam).
paths = ['../data/email/ham/', '../data/email/spam/']

email = []   # raw text of every message, in read order
target = []  # class label of each message, parallel to `email`

for path in paths:
    # The label is the directory name: the second-to-last '/'-separated
    # component of the path, because each path ends with a trailing slash.
    label = path.split('/')[-2]
    # Reads 1.txt through 25.txt from the current directory.
    for i in range(1, 26):
        file_path = f'{path}{i}.txt'
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as fp:
            email.append(fp.read())
        target.append(label)
            

from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus and vectorize it in a
# single pass (fit_transform) rather than tokenizing the same corpus twice
# with fit() followed by transform(). The fitted `tf_idf` is kept so that
# unseen messages can be transformed with the same vocabulary later.
tf_idf = TfidfVectorizer()
# .toarray() converts the vectorizer output into a dense array.
data = tf_idf.fit_transform(email).toarray()
# Bare expression: displays the matrix when this is run as a notebook cell.
data

# NOTE: every score below is computed on the same `data`/`target` the model
# was fitted on, so it is training accuracy rather than a held-out estimate.

# Gaussian naive Bayes
ga_nb = GaussianNB()
ga_nb.fit(data, target)
ga_nb.score(data, target)

# Multinomial naive Bayes
mu_nb = MultinomialNB()
mu_nb.fit(data, target)
mu_nb.score(data, target)

# Bernoulli naive Bayes
be_nb = BernoulliNB()
be_nb.fit(data, target)
be_nb.score(data, target)

# Try the fitted models on four hand-written messages
# (presumably the first two are ordinary mail and the last two spam).
msg = [
    'With Jose out of town, do you want to meet once in a while to keep things going and do some interesting s',
    'Ryan Whybrew commented on your status. Ryan wrote: "turd ferguson or butt horn.',
    'Amazing increase in thickness of yourPenis, up to 30% BetterEjacu1ation control',
    'Increase volume ofEjacu1ate Doctor designed and endorsed 100% herbal, 100% Natural, 100% Safe'
]
# Reuse the already-fitted vectorizer so the new messages are encoded with
# the same vocabulary as the training data, then densify the result.
test_matrix = tf_idf.transform(msg)
X_test = test_matrix.toarray()
# Predicted label from each of the three models for the four messages.
ga_nb.predict(X_test)
mu_nb.predict(X_test)
be_nb.predict(X_test)

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。