对 email 进行二分类：正常邮件和垃圾邮件分别存放在 ham 和 spam 目录下。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# 使用三种方式
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
# Two source directories, one per class; each holds files 1.txt .. 25.txt.
paths = ['../data/email/ham/', '../data/email/spam/']
email = []
target = []
for path in paths:
    # The class label is the directory name, i.e. the second-to-last piece of
    # the trailing-slash path ('ham' or 'spam').
    label = path.split('/')[-2]
    for file_no in range(1, 26):
        # Bytes that are not valid UTF-8 are silently dropped while decoding.
        with open(f'{path}{file_no}.txt', 'r', encoding='utf-8', errors='ignore') as fp:
            email.append(fp.read())
        target.append(label)
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF representation of the corpus.
# fit_transform learns the vocabulary/IDF weights and produces the
# document-term matrix in one pass, instead of tokenizing every e-mail twice
# with separate fit() and transform() calls. The fitted vectorizer is kept in
# `tf_idf` so new text can later be projected onto the same vocabulary.
tf_idf = TfidfVectorizer()
# Densify the sparse result into a plain ndarray (one row per e-mail).
data = tf_idf.fit_transform(email).toarray()
# Bare expression: displays the matrix when run as a notebook cell.
data
# Fit three naive Bayes variants on the full TF-IDF matrix.
# NOTE: each score below is computed on the very same samples the model was
# fitted on, so it is a training accuracy, not a held-out estimate.

# Gaussian naive Bayes
ga_nb = GaussianNB()
ga_nb.fit(data, target)
ga_nb.score(data, target)

# Multinomial naive Bayes
mu_nb = MultinomialNB()
mu_nb.fit(data, target)
mu_nb.score(data, target)

# Bernoulli naive Bayes
be_nb = BernoulliNB()
be_nb.fit(data, target)
be_nb.score(data, target)
# Ad-hoc check: classify a few hand-picked messages with all three models.
msg = [
    'With Jose out of town, do you want to meet once in a while to keep things going and do some interesting s',
    'Ryan Whybrew commented on your status. Ryan wrote: "turd ferguson or butt horn.',
    'Amazing increase in thickness of yourPenis, up to 30% BetterEjacu1ation control',
    'Increase volume ofEjacu1ate Doctor designed and endorsed 100% herbal, 100% Natural, 100% Safe',
]
# Reuse the already-fitted vectorizer (transform only, no refit) so the new
# messages are encoded with the same vocabulary as the training corpus.
X_test = tf_idf.transform(msg)
X_test = X_test.toarray()
# Bare expressions: each yields the predicted labels for the four messages.
ga_nb.predict(X_test)
mu_nb.predict(X_test)
be_nb.predict(X_test)