# Bag-of-words text classification (词袋模型文本分类)
# -*- coding: utf-8 -*-
import pickle

import dill
import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Load the stop words (加载停用词)
def stop_words(path='../data/stopwords.txt'):
    """Load the stop-word list as a set of lower-cased strings.

    Args:
        path: UTF-8 text file holding one stop word per line. Defaults to
            the location this function previously hard-coded.

    Returns:
        A set with every non-empty line of the file, lower-cased and with
        its newline characters removed.
    """
    with open(path, 'r', encoding='utf8') as infile:
        # Only newlines are stripped (not all whitespace), so an entry that
        # carries other surrounding whitespace is kept exactly as written.
        entries = (line.strip('\n') for line in infile)
        return {entry.lower() for entry in entries if entry}
# Train the model (训练模型)
def get_tf_idf(train_file='../data/sohu_train.txt',
               model_file='../model_path/tf_idf_model.pkl'):
    """Train, save and evaluate a TF-IDF + logistic-regression classifier.

    The training file is read as tab-separated text without a header and
    with two columns: the channel label ('频道') and the article text
    ('文章'). A jieba-tokenized TF-IDF vectorizer and a label encoder are
    fitted on the whole file, the rows are split 80/20 (stratified by
    label), a multinomial logistic regression is fitted on the 80% part,
    and the fitted encoder, vectorizer and classifier are pickled together.

    Args:
        train_file: Path of the tab-separated training file.
        model_file: Path the pickled model bundle is written to. The bundle
            is a dict with the keys 'y_encoder', 'tfidf' and 'lr'.

    Returns:
        The evaluation report produced by ``eval_model`` for the held-out
        20% of the rows. The report is also printed.
    """
    # Notes on commonly used LogisticRegression parameters:
    #   penalty: type of regularization term, l1 or l2
    #   C: inverse of the regularization strength; larger means a weaker penalty
    #   fit_intercept: whether to fit a constant (intercept) term
    #   max_iter: maximum number of iterations
    #   multi_class: how a multi-class model is trained
    #       ovr = train one binary classifier per label
    #       multinomial = train the multi-class model directly; only
    #           supported when solver is one of {newton-cg, sag, lbfgs}
    #   solver: optimization method, one of {liblinear, newton-cg, sag, lbfgs}
    #       liblinear works well on small data, sag is faster on large data
    #       for multi-class problems liblinear only supports ovr, the others
    #           support both ovr and multinomial
    #       liblinear supports l1 regularization, the others only l2
    stopwords = stop_words()
    train_data = pd.read_csv(
        train_file,
        sep='\t',
        header=None,
        dtype=np.str_,
        encoding='utf8',
        names=['频道', '文章'],
    )

    # TfidfVectorizer documents stop_words as 'english', a list or None;
    # scikit-learn releases that validate parameter types reject a set, so
    # the set is converted to a (sorted, hence deterministic) list.
    tfidf = TfidfVectorizer(
        tokenizer=jieba.lcut,
        stop_words=sorted(stopwords),
        min_df=50,
        max_df=0.3,
    )
    x = tfidf.fit_transform(train_data['文章'])

    y_encoder = LabelEncoder()
    y = y_encoder.fit_transform(train_data['频道'])

    # Split row indices rather than the data itself so the same index lists
    # select rows from both the sparse matrix and the label array.
    train_idx, test_idx = train_test_split(range(len(y)), test_size=0.2, stratify=y)
    train_x = x[train_idx, :]
    train_y = y[train_idx]
    test_x = x[test_idx, :]
    test_y = y[test_idx]

    model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    model.fit(train_x, train_y)
    test_y_pred = model.predict(test_x)

    # Save the model together with the objects needed to use it later.
    with open(model_file, 'wb') as outfile:
        pickle.dump({
            'y_encoder': y_encoder,
            'tfidf': tfidf,
            'lr': model
        }, outfile)

    # The report used to be computed and thrown away; surface it instead.
    report = eval_model(test_y, test_y_pred, y_encoder.classes_)
    print(report)
    return report
# Compute the evaluation metrics (计算各项评价指标)
def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision/recall/F1/support report.

    Args:
        y_true: True labels, passed to ``precision_recall_fscore_support``.
        y_pred: Predicted labels, passed to the same function.
        labels: Display names placed in the 'Label' column, one per class
            row returned by ``precision_recall_fscore_support``.

    Returns:
        A DataFrame with the columns Label, Precision, Recall, F1 and
        Support: one row per class, followed by a row labelled '总体'
        (index 999) holding the support-weighted averages and the total
        support.
    """
    column_order = [u'Label', u'Precision', u'Recall', u'F1', u'Support']

    # Per-class precision, recall, F1 and support.
    precision, recall, f_score, support = precision_recall_fscore_support(y_true, y_pred)

    # Overall figures: averages weighted by each class's support.
    overall_precision = np.average(precision, weights=support)
    overall_recall = np.average(recall, weights=support)
    overall_f_score = np.average(f_score, weights=support)
    overall_support = np.sum(support)

    per_class = pd.DataFrame({
        u'Label': labels,
        u'Precision': precision,
        u'Recall': recall,
        u'F1': f_score,
        u'Support': support,
    })
    # Index 999 keeps the summary row visibly apart from the class rows.
    overall = pd.DataFrame(
        {
            u'Label': [u'总体'],
            u'Precision': [overall_precision],
            u'Recall': [overall_recall],
            u'F1': [overall_f_score],
            u'Support': [overall_support],
        },
        index=[999],
    )

    report = pd.concat([per_class, overall])
    return report[column_order]
# Predict on new data (对新数据进行预测)
def prdition(model_file, new_data, *, limit=10):
    """Predict encoded channel labels for new articles with a saved model.

    Args:
        model_file: Path of a pickled dict holding a fitted vectorizer under
            'tfidf' and a fitted classifier under 'lr'.
        new_data: Object whose '文章' entry holds the article texts and
            supports slicing.
        limit: Number of leading rows to predict. Defaults to 10, the value
            this function previously hard-coded; pass ``None`` to predict
            every row.

    Returns:
        Whatever the classifier's ``predict`` returns for the selected
        rows. The values are not decoded back to channel names here.
    """
    # SECURITY: unpickling executes arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_file, 'rb') as infile:
        model = pickle.load(infile)

    articles = new_data[u'文章']
    if limit is not None:
        articles = articles[:limit]

    new_x = model['tfidf'].transform(articles)
    return model['lr'].predict(new_x)
if __name__ == '__main__':
    # Script entry point: run the training pipeline defined above.
    get_tf_idf()