```python
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
import pandas as pd
import jieba
def cut(text):
    """Segment a Chinese title into a space-joined word string.

    Uses jieba full mode, e.g. '我爱北京天安门' => '我 爱 北京 天安 天安门'.
    The space-separated output is the whitespace-tokenized form that
    TfidfVectorizer expects.

    :param text: title string to segment
    :return: a single string of words separated by spaces
    """
    # jieba.cut returns an iterator of str; str.join consumes it directly,
    # so no intermediate list() is needed.
    return ' '.join(jieba.cut(text, cut_all=True))
def message_classification():
    """Train and evaluate a Naive Bayes classifier over post titles.

    Reads '广告.csv' (advertisement) and '考研.csv' (postgraduate exam)
    from the working directory — each must have a '标题' (title) column —
    segments the titles with jieba, vectorizes them with TF-IDF, fits a
    MultinomialNB estimator under 3-fold cross-validation, and prints
    accuracy, the best cross-validation score, and weighted recall.

    :return: None
    """
    # Load both datasets and build the parallel label list:
    # every ad title is labelled '广告', every exam title '考研'.
    ad = pd.read_csv('广告.csv')['标题'].to_list()
    kaoyan = pd.read_csv('考研.csv')['标题'].to_list()
    target = ['广告'] * len(ad) + ['考研'] * len(kaoyan)
    # Segment each title (ads first, then exam titles, matching `target`).
    data = [cut(text) for text in ad + kaoyan]
    # Split the dataset; the fixed random_state keeps the split reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=33)
    # Feature engineering: TF-IDF text vectorization. Fit on the training
    # set only, then apply the fitted vocabulary to the test set.
    transfer = TfidfVectorizer()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    # Multinomial Naive Bayes with 3-fold cross-validation. The empty
    # param_grid means no hyperparameter search — GridSearchCV is used
    # here only to obtain a cross-validated score (best_score_).
    estimator = GridSearchCV(MultinomialNB(), param_grid={}, cv=3)
    estimator.fit(x_train, y_train)
    # Model evaluation on the held-out test set.
    y_predict = estimator.predict(x_test)
    # Accuracy on the test set.
    score = estimator.score(x_test, y_test)
    print("准确率: ", score)
    # Mean cross-validated score of the (single) fitted configuration.
    score = estimator.best_score_
    print("综合值: ", score)
    # Weighted recall accounts for the class imbalance between labels.
    recall = recall_score(y_test, y_predict, average='weighted')
    print("召回率: ", recall)
if __name__ == "__main__":
    # Script entry point: run the title classifier end-to-end.
    message_classification()
```
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LOVEWE~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.345 seconds.
Prefix dict has been built succesfully.
准确率: 0.8650519031141869
综合值: 0.8403288619645175
召回率: 0.8650519031141869