使用TF-IDF和词袋模型表示文档特征，实现文本分类

词袋模型文本分类

# -*- coding: utf-8 -*-

import pickle

import dill
import jieba
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

加载停用词

def stop_words(path='../data/stopwords.txt'):
    """Load the stop-word list from a UTF-8 text file.

    The file is expected to hold one stop word per line. Only the line
    terminator is stripped, so an entry made of (or padded with) spaces is
    kept exactly as written; lines that are empty after stripping are
    skipped. Every entry is lower-cased before being stored.

    Args:
        path: Location of the stop-word file. Defaults to
            ``'../data/stopwords.txt'``, the path this function previously
            had hard-coded.

    Returns:
        A set of the distinct, lower-cased stop words.

    Raises:
        OSError: If the file cannot be opened.
    """
    stopwords = set()
    with open(path, 'r', encoding='utf8') as infile:
        for line in infile:
            # Strip only the newline: whitespace can itself be a stop token.
            line = line.strip('\n')
            if line:
                stopwords.add(line.lower())
    return stopwords

训练模型

def get_tf_idf():
    """Train, persist and evaluate a TF-IDF + logistic-regression classifier.

    Steps performed:

    1. Read ``../data/sohu_train.txt`` (tab-separated, no header row) into
       two string columns named ``频道`` (label) and ``文章`` (text).
    2. Fit a ``TfidfVectorizer`` (tokenizer ``jieba.lcut``, stop words from
       ``stop_words()``, ``min_df=50``, ``max_df=0.3``) on the text column
       and a ``LabelEncoder`` on the label column.
    3. Hold out a stratified 20% of the rows, fit a ``LogisticRegression``
       on the remaining rows and predict the held-out rows.
    4. Pickle the encoder, vectorizer and model to
       ``../model_path/tf_idf_model.pkl`` under the keys ``'y_encoder'``,
       ``'tfidf'`` and ``'lr'``.
    5. Score the held-out predictions with ``eval_model``.

    Returns:
        The evaluation report produced by ``eval_model`` (also printed, so
        that running the script shows the metrics).
    """
    # Commonly used LogisticRegression parameters:
    #   penalty:       type of regularisation term, l1 or l2
    #   C:             inverse of the regularisation strength; larger means
    #                  a weaker penalty
    #   fit_intercept: whether to fit an intercept term
    #   max_iter:      maximum number of iterations
    #   multi_class:   how the multi-class model is trained
    #       ovr         = train one binary classifier per label
    #       multinomial = train a single multi-class model; only supported
    #                     when solver is one of {newton-cg, sag, lbfgs}
    #   solver:        optimisation method, one of
    #                  {liblinear, newton-cg, sag, lbfgs}
    #       liblinear works well on small data, sag is faster on large data
    #       for multi-class problems liblinear only supports ovr, the others
    #       support both ovr and multinomial
    #       liblinear supports l1 regularisation, the others only l2

    # scikit-learn documents ``stop_words`` as 'english', a list or None, and
    # recent releases validate the type, so hand over a list, not the set.
    stopwords = sorted(stop_words())

    train_data = pd.read_csv(
        '../data/sohu_train.txt',
        sep='\t',
        header=None,
        dtype=np.str_,
        encoding='utf8',
        names=['频道', '文章'],
    )

    # NOTE(review): the vectorizer is fitted on every row before the
    # train/test split, so held-out documents contribute to the vocabulary
    # and IDF weights. Left unchanged to keep the saved model identical.
    tfidf = TfidfVectorizer(
        tokenizer=jieba.lcut,
        stop_words=stopwords,
        min_df=50,
        max_df=0.3,
    )
    x = tfidf.fit_transform(train_data['文章'])

    y_encoder = LabelEncoder()
    y = y_encoder.fit_transform(train_data['频道'])

    # Split row indices rather than the data so that the feature matrix and
    # the label vector are sliced with the same indices.
    train_idx, test_idx = train_test_split(
        range(len(y)), test_size=0.2, stratify=y
    )
    train_x = x[train_idx, :]
    train_y = y[train_idx]
    test_x = x[test_idx, :]
    test_y = y[test_idx]

    # NOTE(review): newer scikit-learn releases deprecate the ``multi_class``
    # argument - confirm against the installed version.
    model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    model.fit(train_x, train_y)
    test_y_pred = model.predict(test_x)

    # Save the model together with the fitted preprocessing objects.
    with open('../model_path/tf_idf_model.pkl', 'wb') as outfile:
        pickle.dump(
            {
                'y_encoder': y_encoder,
                'tfidf': tfidf,
                'lr': model,
            },
            outfile,
        )

    # The report used to be computed and thrown away; surface it instead.
    report = eval_model(test_y, test_y_pred, y_encoder.classes_)
    print(report)
    return report

计算各项评价指标

def eval_model(y_true, y_pred, labels):
    """Build a per-class and overall precision/recall/F1 report.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        labels: Display names for the classes. Must contain one entry per
            score returned by ``precision_recall_fscore_support`` and be in
            the same order, because both go into the same table columns.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Label``, ``Precision``,
        ``Recall``, ``F1`` and ``Support``: one row per class, followed by a
        summary row labelled ``总体`` (index 999) holding the
        support-weighted average of each metric and the total support.
    """
    # Per-class precision, recall, F1 and support.
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    # Overall figures: each metric averaged with class support as weight.
    tot_p = np.average(p, weights=s)
    tot_r = np.average(r, weights=s)
    tot_f1 = np.average(f1, weights=s)
    tot_s = np.sum(s)

    res1 = pd.DataFrame({
        u'Label': labels,
        u'Precision': p,
        u'Recall': r,
        u'F1': f1,
        u'Support': s,
    })
    res2 = pd.DataFrame({
        u'Label': [u'总体'],
        u'Precision': [tot_p],
        u'Recall': [tot_r],
        u'F1': [tot_f1],
        u'Support': [tot_s],
    })
    # Give the summary row an index that stands apart from the class rows.
    res2.index = [999]

    res = pd.concat([res1, res2])
    # Fix the column order of the returned report.
    return res[[u'Label', u'Precision', u'Recall', u'F1', u'Support']]

对新数据进行预测

def prdition(model_file, new_data, limit=10):
    """Predict classes for new documents with a previously saved model.

    Args:
        model_file: Path to a pickle file holding a mapping with at least
            the keys ``'tfidf'`` (object with ``transform``) and ``'lr'``
            (object with ``predict``).
        new_data: Container whose ``'文章'`` entry is a sliceable sequence of
            documents (for example a DataFrame column).
        limit: Maximum number of leading documents to predict. Defaults to
            10, the number this function previously had hard-coded; pass
            ``None`` to predict every document.

    Returns:
        Whatever ``model['lr'].predict`` returns for the selected documents.
        Presumably these are encoded class ids that the stored
        ``'y_encoder'`` can map back to names - verify against the file
        being loaded.
    """
    import pickle

    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load model files that come from a trusted source.
    with open(model_file, 'rb') as infile:
        model = pickle.load(infile)

    documents = new_data[u'文章'][:limit]
    new_x = model['tfidf'].transform(documents)
    new_y_pred = model['lr'].predict(new_x)
    return new_y_pred

# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    get_tf_idf()

最后编辑于：2019.12.23 22:48:42