Using the jieba library to extract word features and an XGBoost model for multi-class news classification.

The news falls into eight categories overall; the structure is roughly as follows:


[Figure: news.png (overview of the eight news categories)]

Loading the training and test data

Data download

import pandas as pd
import numpy as np
import jieba
from collections import Counter

train = pd.read_csv("true_train.csv", encoding="utf-8", sep="\t", header=None)
test = pd.read_csv("true_test.csv", encoding="utf-8", sep="\t", header=None)
# Drop the first column, then relabel the remaining columns so that column 0
# is the category label and column 1 is the news text, matching how they are
# accessed throughout the rest of the code.
train = train.iloc[:, 1:].copy()
train.columns = [0, 1]
test = test.iloc[:, 1:].copy()
test.columns = [0, 1]

Splitting a passage into words
Click here for a jieba tutorial.

def get_words(n, train):
    """Deduplicated full-mode jieba tokens of the n-th article (1-based)."""
    t = str(train.iloc[n - 1, 1])
    seg_list = jieba.cut(t, cut_all=True)   # full mode
    # seg_list = jieba.cut_for_search(t)    # search-engine mode
    return list(set(seg_list))

def get_words_line(n, train):
    """Raw text of the n-th article (1-based)."""
    return str(train.iloc[n - 1, 1])
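
As a quick illustration of what full mode produces compared with precise mode (a minimal sketch; the sample sentence is made up):

import jieba

sentence = "我来到北京清华大学"  # hypothetical sample sentence
print("/".join(jieba.cut(sentence, cut_all=True)))   # full mode: every word jieba can find
print("/".join(jieba.cut(sentence, cut_all=False)))  # precise mode: one best segmentation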

Converting a dict to a list of (key, value) pairs:

def dict2list(dic: dict):
    """Convert a dict to a list of (key, value) tuples."""
    return list(dic.items())
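
Note that for Counter objects, the sorted(dict2list(...), reverse=True) pattern used below gives the same result as Counter.most_common(), for example:

from collections import Counter

c = Counter(["a", "a", "b"])
print(sorted(dict2list(c), key=lambda x: x[1], reverse=True))  # [('a', 2), ('b', 1)]
print(c.most_common())                                         # same result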

Data processing:

class line_word():
    """Per-category statistics over the training data."""
    kind_list = ['体育', '汽车', '军事', '科技', '娱乐', '财经', '旅游', '社会']
    def __init__(self, data):
        self.data = data
    def _rows_of_kind(self, k):
        """All rows belonging to the k-th category (0-based), reindexed from 0."""
        t1 = self.data.loc[self.data[0] == line_word.kind_list[k]]
        t1.index = [x for x in range(len(t1))]
        return t1
    def get_words_of_kind(self, k):
        """Full-mode tokens of the k-th category as (word, count) pairs,
        sorted by count in descending order."""
        t1 = self._rows_of_kind(k)
        words = []
        for i in range(1, len(t1) + 1):
            words.extend(get_words(i, t1))
        counts = Counter(words)
        return sorted(dict2list(counts), key=lambda x: x[1], reverse=True)
    def get_lengths_of_kind(self, k):
        """Average text length of the k-th category, plus the per-article lengths."""
        t1 = self._rows_of_kind(k)
        total = 0
        lengths = []
        for i in range(1, len(t1) + 1):
            d = get_words_line(i, t1)
            total += len(d)
            lengths.append(len(d))
        return total / len(t1), lengths
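
For example, the statistics of the first category (体育) can be inspected like this (a quick sketch, assuming train has been loaded as above):

lw = line_word(train)
top_words = lw.get_words_of_kind(0)          # (word, count) pairs, most frequent first
print(top_words[:10])
avg_len, lengths = lw.get_lengths_of_kind(0)
print(avg_len)                               # average article length for 体育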

Building the word-feature table:

def deal_1(dic):
    """Drop tokens of length <= 1 (single characters and empty strings)
    from a list of (word, count) pairs."""
    return [item for item in dic if len(item[0]) > 1]

def give_power(f1, f2, f3, f4, f5, f6, f7, f8):
    """Pack eight per-category frequencies into a float array.
    (Normalisation experiments were tried here and abandoned.)"""
    return np.array([f1, f2, f3, f4, f5, f6, f7, f8], dtype=np.float64)
    
    
def get_word_power():
    """Collect (word, count, kind) rows across all eight categories."""
    lw = line_word(train)
    all_key = []
    all_values = []
    all_kind = []
    for k in range(8):
        dic = deal_1(lw.get_words_of_kind(k))
        all_key.extend(pair[0] for pair in dic)
        all_values.extend(pair[1] for pair in dic)
        all_kind.extend(k for _ in dic)
    all_word_list = pd.DataFrame({"words": all_key, "values": all_values, "kind": all_kind})
    return all_word_list
    
def hanshu_1(df):
    """Variance of the eight per-category counts (columns 0-7) of a word's row."""
    df1 = np.array(df.iloc[:, :8].copy(), dtype=np.float32)
    return np.var(df1)
def hanshu_2(df):
    """Variance of the eight per-category ratio columns (columns 12 onward)."""
    df1 = np.array(df.iloc[:, 12:].copy(), dtype=np.float32)
    return np.var(df1)
    
def get_word_Data():
    all_word_list1 = get_word_power()
    # Pivot to one row per word with the eight per-category counts as columns.
    t = all_word_list1.groupby(["words", "kind"], as_index=True).sum().unstack()
    all_all_all = pd.DataFrame(t.values)
    all_all_all["words"] = t.index
    all_all_all = all_all_all.fillna(0)
    all_all_all["sum_values"] = all_all_all.iloc[:, :8].sum(1)
    all_all_all["mean_values"] = all_all_all.iloc[:, :8].mean(1)
    all_all_all.sort_values(["sum_values"], ascending=False, inplace=True)
    # Variance of the raw counts across the eight categories.
    t_var = all_all_all.groupby(["words"]).apply(hanshu_1)
    all_all_all = pd.merge(all_all_all,
                           pd.DataFrame({"words": t_var.index.tolist(), "var": t_var.tolist()}),
                           how="left", on=["words"])
    all_all_all.sort_values(["var"], ascending=False, inplace=True)
    # Per-category share of each word's total occurrences.
    for i in range(8):
        all_all_all[line_word.kind_list[i] + "比例"] = all_all_all[i] / all_all_all["sum_values"]
    # Variance of those shares across the eight categories.
    g_var = all_all_all.groupby(["words"]).apply(hanshu_2)
    all_all_all = pd.merge(all_all_all,
                           pd.DataFrame({"words": g_var.index.tolist(), "p_var": g_var.tolist()}),
                           how="left", on=["words"])
    all_all_all.sort_values(["var"], ascending=False, inplace=True)
    all_all_all["var*p"] = all_all_all["var"] * all_all_all.p_var
    all_all_all.index = [x for x in range(len(all_all_all))]
    return all_all_all
all_all_all = get_word_Data()
[Figure: word_power.png (a preview of the per-word feature table)]
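
A quick sanity check on the resulting table (a sketch, assuming the code above has run):

print(all_all_all.shape)
print(all_all_all.columns.tolist())  # 8 count columns, words, sum/mean, var, 8 ratio columns, p_var, var*p
print(all_all_all.head())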

Per-word features obtained

Extracting article features

For each article, take its 30 most frequent words (full-mode tokens longer than one character) and look up each word's 12-dimensional feature row: the mean count, the count variance across the eight categories, the eight per-category ratios, the ratio variance, and the variance product. Articles with fewer than 30 qualifying words are zero-padded, giving a fixed 30 x 12 = 360-dimensional feature vector per article.
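
Concretely, the 12 per-word features are the columns of all_all_all from position 10 onward (a sketch, assuming the column order produced by get_word_Data above):

feature_cols = all_all_all.columns[10:].tolist()
# Expected: ['mean_values', 'var', '体育比例', '汽车比例', '军事比例', '科技比例',
#            '娱乐比例', '财经比例', '旅游比例', '社会比例', 'p_var', 'var*p']
print(len(feature_cols))  # 12 features per word; 30 words -> 360 features per article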

class analysis_text_one():
    """Builds the 360-dimensional feature vector for the i-th training article.
    Every multi-character token of a training article is guaranteed to appear
    in all_all_all, so no missing-word handling is needed here."""
    def __init__(self, data, i, all_data):
        self.data = data.iloc[i, 1]      # article text
        self.target = data.iloc[i, 0]    # category label
        self.all_all_all = all_data
    def deal_line(self):
        tokens = list(jieba.cut(self.data, cut_all=True))  # full mode
        pairs = sorted(dict2list(Counter(tokens)), key=lambda x: x[1], reverse=True)
        pairs = deal_1(pairs)            # drop single-character tokens
        top30 = pairs[:30]
        cols = self.all_all_all.columns[10:].tolist()      # the 12 per-word feature columns
        gg_v = self.all_all_all.loc[self.all_all_all["words"] == top30[0][0], cols].values
        for i in range(1, 30):
            if i < len(top30):
                gg2 = self.all_all_all.loc[self.all_all_all["words"] == top30[i][0], cols]
                gg_v = np.row_stack((gg_v, gg2.values))
            else:
                gg_v = np.row_stack((gg_v, np.zeros(12)))  # pad articles with fewer than 30 words
        return gg_v.flatten()

class analysis_text_test_one():
    """Same as analysis_text_one, but a test article may contain words never
    seen during training; those get an all-zero feature row."""
    def __init__(self, data, i, all_data):
        self.data = data.iloc[i, 1]
        self.target = data.iloc[i, 0]
        self.all_all_all = all_data
    def deal_line(self):
        tokens = list(jieba.cut(self.data, cut_all=True))  # full mode
        pairs = sorted(dict2list(Counter(tokens)), key=lambda x: x[1], reverse=True)
        pairs = deal_1(pairs)
        top30 = pairs[:30]
        cols = self.all_all_all.columns[10:].tolist()
        gg_v = self.all_all_all.loc[self.all_all_all["words"] == top30[0][0], cols].values
        for i in range(1, 30):
            if i < len(top30):
                gg2 = self.all_all_all.loc[self.all_all_all["words"] == top30[i][0], cols]
                if len(gg2.values) == 0:                   # word unseen in training
                    gg_v = np.row_stack((gg_v, np.zeros(12)))
                else:
                    gg_v = np.row_stack((gg_v, gg2.values))
            else:
                gg_v = np.row_stack((gg_v, np.zeros(12)))  # pad short articles
        # If the very first word was unseen, gg_v starts one row short; pad to 360.
        missing = (360 - len(gg_v.flatten())) // 12
        for _ in range(missing):
            gg_v = np.row_stack((gg_v, np.zeros(12)))
        return gg_v.flatten()
train_feature = analysis_text_one(train, 0, all_all_all).deal_line()
for i in range(1, len(train)):
    train_feature = np.row_stack((train_feature, analysis_text_one(train, i, all_all_all).deal_line()))
# Map each category name to its integer index 0-7.
train_labels = train[0].apply(lambda x: line_word.kind_list.index(x)).values

test_feature = analysis_text_test_one(test, 0, all_all_all).deal_line()
for i in range(1, len(test)):
    test_feature = np.row_stack((test_feature, analysis_text_test_one(test, i, all_all_all).deal_line()))
    if i % 50 == 1:
        print(i, len(test_feature))  # progress indicator
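
A quick shape check before training (each article should yield exactly 360 features):

assert train_feature.shape == (len(train), 360)
assert test_feature.shape == (len(test), 360)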

Classifying with the XGBoost model

# -*- coding: utf-8 -*-
import numpy as np
import xgboost as xgb

params = {
    'booster': 'gbtree',
    # Eight news categories, so use the multi-class softmax objective.
    'objective': 'multi:softmax',
    'num_class': 8,           # number of classes, required with multi:softmax
    'gamma': 0.05,            # minimum loss reduction for a further split; larger = more conservative
    'max_depth': 12,          # maximum tree depth
    # 'lambda': 450,          # L2 regularisation weight
    'subsample': 0.4,         # fraction of rows sampled per tree
    'colsample_bytree': 0.7,  # fraction of columns sampled per tree
    # 'min_child_weight': 12, # minimum sum of instance weights in a child node
    'silent': 1,
    'eta': 0.01,              # learning rate
    'seed': 710,
    'nthread': 4,             # number of CPU threads
}

plst = list(params.items())

# The labelled training file is split three ways: the first 8,000 rows for
# training, rows 8,000-10,000 for validation, and the remainder held out
# for a final accuracy check.
offset1 = 8000
offset2 = 10000
num_rounds = 368  # maximum number of boosting rounds

xgtest = xgb.DMatrix(train_feature[offset2:])
xgtrain = xgb.DMatrix(train_feature[:offset1, :], label=train_labels[:offset1])
xgval = xgb.DMatrix(train_feature[offset1:offset2, :], label=train_labels[offset1:offset2])

# Report the train and validation error each round.
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
# With a large num_rounds, early_stopping_rounds stops training once the
# validation metric has not improved for 100 consecutive rounds.
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100)
# model.save_model('./model/xgb.model')  # optionally persist the trained model
preds = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
print(preds)
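
Since xgtest was carved out of the labelled training file, the held-out accuracy can be checked directly (a sketch, assuming the split above):

holdout_labels = train_labels[offset2:]
accuracy = (preds == holdout_labels).mean()  # multi:softmax returns class indices
print("held-out accuracy: %.4f" % accuracy)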

The final accuracy is only about 87%, which is not a great result. Next, we plan to tackle the same news classification problem using only a TF-IDF model.
