113-文本分析之有监督分类

1、二分类

p_load(fastrtext, tidyfst)

data("train_sentences")
data("test_sentences")

# 只提取(“AIMX”和“CONT”)两类
train_raw <- train_sentences %>% 
  as_tibble() %>% 
  filter(class.text %in% c("AIMX", "CONT"))

test_raw <- test_sentences %>% 
  as_tibble() %>% 
  filter(class.text %in% c("AIMX", "CONT"))

table(train_raw$class.text)
## 
## AIMX CONT 
##  149  144
# 预处理,分词、去除停止词,计算tf-idf
p_load(tidytext)

sel_word <- train_raw %>% 
  # 分词
  unnest_tokens(word, text) %>% 
  # 去除停用词
  anti_join(stop_words) %>% 
  group_by(class.text) %>% 
  count(word) %>% 
  ungroup() %>% 
  bind_tf_idf(word, class.text, n) %>% 
  distinct(word, .keep_all = T) %>% 
  # 前100
  top_n(100, tf_idf) %>% 
  select(word, tf_idf)

sel_word
## # A tibble: 107 × 2
##    word           tf_idf
##    <chr>           <dbl>
##  1 adopted       0.00210
##  2 algebraic     0.00210
##  3 anchors       0.00140
##  4 answer        0.00175
##  5 ases          0.00210
##  6 attempt       0.00210
##  7 avoid         0.00175
##  8 balanced      0.00210
##  9 bernoulli     0.00210
## 10 circumstances 0.00210
## # … with 97 more rows

因为分值相同,所以最终结果多余100个。

# 重新构造训练集和测试集
train1 <- train_raw %>% 
  # 为每一个文本单独编号
  mutate(id = 1:n()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>%
  # 筛选目标词
  inner_join(sel_word) %>% 
  # 去除句内重复
  distinct(id, word, .keep_all = T) %>% 
  # 长表转宽表
  wider_dt(name = "word",
           value = "tf_idf",
           fill = 0)

# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
train <- train_raw %>% 
  mutate(id = 1:n()) %>% 
  select(id, class.text) %>% 
  left_join(train1) %>% 
  # 缺失值插入0
  replace_na_dt(to = 0) %>% 
  select(-id)

# 测试集进行同样的操作
test1 <- test_raw %>% 
  # 为每一个文本单独编号
  mutate(id = 1:n()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>%
  # 筛选目标词
  inner_join(sel_word) %>% 
  # 去除句内重复
  distinct(id, word, .keep_all = T) %>% 
  # 长表转宽表
  wider_dt(name = "word",
           value = "tf_idf",
           fill = 0)

# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
test2 <- test_raw %>% 
  mutate(id = 1:n()) %>% 
  select(id, class.text) %>% 
  left_join(test1) %>% 
  # 缺失值插入0
  replace_na_dt(to = 0) %>% 
  select(-id)

# 数据框需要补齐到与训练集长度一致,所以需要补全所有单词,并标记为0
# 获取需要补全的单词
to_add <- setdiff(names(train), names(test2))

# 一定要有小括号,表示向量
test <- test2[, (to_add) := 0]
# 检查长度是否一致
length(train) == length(test)
## [1] TRUE
# 检查两列名称是否一致
setequal(names(train), names(test))
## [1] TRUE
# 响应变量转化为因子型,否则很多机器学习模型会误认为是回归问题
train <- train %>% 
  mutate_dt(class.text = as.factor(class.text))

test <- test %>% 
  mutate_dt(class.text = as.factor(class.text))
# 建模分析与评估
train_model <- glm(class.text ~ ., data = train,
                   family = "binomial")
summary(train_model)
## 
## Call:
## glm(formula = class.text ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.62589  -0.00003   0.00000   0.00004   0.78760  
## 
## Coefficients: (13 not defined because of singularities)
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.012e+00  2.919e-01   3.465  0.00053 ***
## adopted        -1.111e+03  6.440e+06   0.000  0.99986    
## al              9.213e+03  6.783e+06   0.001  0.99892    
## algebraic      -1.027e+04  6.089e+06  -0.002  0.99865    
## anchors         1.443e+04  2.449e+07   0.001  0.99953    
## answer          1.306e+04  2.365e+07   0.001  0.99956    
## ases           -9.862e+03  5.586e+06  -0.002  0.99859    
## aspects         1.086e+04  6.838e+06   0.002  0.99873    
## attempt         9.617e+03  1.633e+07   0.001  0.99953    
## avoid          -1.333e+03  1.234e+07   0.000  0.99991    
## balanced       -2.050e+04  2.912e+07  -0.001  0.99944    
## bernoulli      -1.073e+04  5.670e+06  -0.002  0.99849    
## biological      1.149e+04  4.592e+07   0.000  0.99980    
## bounds         -3.674e+03  4.995e+07   0.000  0.99994    
## cdna           -1.073e+03  1.901e+07   0.000  0.99995    
## cellular       -3.655e+02  9.442e+06   0.000  0.99997    
## choices         6.010e+03  5.164e+06   0.001  0.99907    
## circumstances   1.217e+04  3.864e+07   0.000  0.99975    
## clinical        1.352e+04  7.915e+07   0.000  0.99986    
## computer       -1.514e+04  7.555e+06  -0.002  0.99840    
## conditions     -2.507e+01  2.147e+07   0.000  1.00000    
## continuous     -6.085e+03  3.183e+06  -0.002  0.99847    
## control        -1.609e+04  2.690e+07  -0.001  0.99952    
## current        -1.077e+04  2.630e+07   0.000  0.99967    
## describe       -1.609e+04  2.083e+07  -0.001  0.99938    
## designed       -1.077e+04  2.197e+07   0.000  0.99961    
## difficult       3.709e+03  2.250e+06   0.002  0.99868    
## discuss        -1.566e+04  1.092e+07  -0.001  0.99886    
## drawn                  NA         NA      NA       NA    
## easy            1.357e+04  1.206e+07   0.001  0.99910    
## empirical       1.369e+04  9.249e+06   0.001  0.99882    
## equal           1.144e+04  9.394e+06   0.001  0.99903    
## evs            -2.972e+04  4.291e+07  -0.001  0.99945    
## examine        -1.216e+04  6.315e+06  -0.002  0.99846    
## existing       -1.482e+04  8.713e+07   0.000  0.99986    
## expected       -5.598e+03  1.838e+07   0.000  0.99976    
## experience     -8.787e-08  1.944e+07   0.000  1.00000    
## extends         1.168e+04  4.297e+07   0.000  0.99978    
## extensive      -1.135e+03  3.134e+07   0.000  0.99997    
## fixed           1.449e+04  4.484e+07   0.000  0.99974    
## generalization -2.215e+03  8.198e+07   0.000  0.99998    
## generation     -8.624e+02  1.003e+07   0.000  0.99993    
## genome         -1.481e+04  8.492e+06  -0.002  0.99861    
## implies                NA         NA      NA       NA    
## independently   1.505e+04  3.466e+07   0.000  0.99965    
## indirect        1.368e+04  9.175e+06   0.001  0.99881    
## influence       4.287e+01  2.807e+07   0.000  1.00000    
## influences      1.029e+04  2.848e+07   0.000  0.99971    
## introduce      -6.291e+03  1.095e+07  -0.001  0.99954    
## iterations     -1.870e+02  1.374e+07   0.000  0.99999    
## judgments              NA         NA      NA       NA    
## libraries              NA         NA      NA       NA    
## library        -1.604e+04  2.946e+07  -0.001  0.99957    
## limitations     2.338e+02  3.617e+07   0.000  0.99999    
## limited         4.238e+03  2.048e+06   0.002  0.99835    
## means          -1.467e+04  4.186e+07   0.000  0.99972    
## measure         1.442e+04  2.410e+07   0.001  0.99952    
## minimization    1.154e+04  2.074e+07   0.001  0.99956    
## missing         1.392e+04  1.551e+07   0.001  0.99928    
## modified        1.494e+03  3.181e+07   0.000  0.99996    
## mouse           1.075e+03  2.722e+07   0.000  0.99997    
## network        -7.355e+03  3.956e+06  -0.002  0.99852    
## neuronal        4.767e+03  3.914e+06   0.001  0.99903    
## openness        9.574e+03  4.421e+07   0.000  0.99983    
## optimal         1.430e+04  1.174e+07   0.001  0.99903    
## paper          -1.342e+03  4.259e+05  -0.003  0.99749    
## parallel       -9.937e+03  6.048e+06  -0.002  0.99869    
## parameters      1.383e+04  5.279e+07   0.000  0.99979    
## parametric      1.144e+04  1.627e+07   0.001  0.99944    
## peptides       -8.782e+03  4.142e+06  -0.002  0.99831    
## personality    -1.537e+04  3.681e+07   0.000  0.99967    
## pie                    NA         NA      NA       NA    
## pml            -2.615e+04  4.132e+07  -0.001  0.99949    
## pro                    NA         NA      NA       NA    
## probability    -1.171e+04  4.055e+07   0.000  0.99977    
## processes              NA         NA      NA       NA    
## propose        -9.195e+03  5.952e+06  -0.002  0.99877    
## question       -9.195e+03  8.418e+06  -0.001  0.99913    
## radically       1.884e+04  2.282e+07   0.001  0.99934    
## range                  NA         NA      NA       NA    
## regularized            NA         NA      NA       NA    
## representative -2.824e+04  2.248e+07  -0.001  0.99900    
## require         3.128e+02  8.615e+06   0.000  0.99997    
## response        1.078e+04  6.368e+06   0.002  0.99865    
## risk           -1.073e+04  1.389e+07  -0.001  0.99938    
## sampling               NA         NA      NA       NA    
## sces           -1.616e+04  4.698e+07   0.000  0.99973    
## selection       4.287e+01  1.965e+07   0.000  1.00000    
## sequences      -4.554e+02  1.504e+07   0.000  0.99998    
## short          -5.361e+03  9.521e+07   0.000  0.99996    
## slightly       -4.269e+02  2.352e+07   0.000  0.99999    
## stability       1.701e+04  3.439e+07   0.000  0.99961    
## structure      -9.061e+03  5.052e+06  -0.002  0.99857    
## students       -1.306e+04  2.365e+07  -0.001  0.99956    
## substantial     1.133e+04  7.623e+06   0.001  0.99881    
## tailored       -2.747e+02  2.322e+07   0.000  0.99999    
## target          1.501e+03  2.632e+07   0.000  0.99995    
## taxonomy               NA         NA      NA       NA    
## theorem        -5.853e+02  9.615e+06   0.000  0.99995    
## therapy         7.685e+03  6.300e+06   0.001  0.99903    
## trait          -1.744e+04  1.837e+07  -0.001  0.99924    
## transfer       -1.287e+04  1.179e+07  -0.001  0.99913    
## type           -1.482e+04  8.563e+06  -0.002  0.99862    
## typically       1.105e+04  6.614e+06   0.002  0.99867    
## uncertainty    -5.476e-08  1.591e+07   0.000  1.00000    
## unlike          1.133e+04  1.548e+07   0.001  0.99942    
## weakening              NA         NA      NA       NA    
## widely                 NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 406.10  on 292  degrees of freedom
## Residual deviance:  69.59  on 198  degrees of freedom
## AIC: 259.59
## 
## Number of Fisher Scoring iterations: 20
obj <- pull(test, class.text)

# 对测试集进行预测
test_pre <- predict.glm(train_model, 
                        select(test, -class.text),
                        type = "response")

test_pre <- ifelse(test_pre >= 0.5, 
                   levels(obj)[2], levels(obj)[1]) %>% 
  as.factor()

# 混淆矩阵计算精确度、KAPPA值
caret::confusionMatrix(test_pre, obj)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction AIMX CONT
##       AIMX   35    2
##       CONT    6   22
##                                           
##                Accuracy : 0.8769          
##                  95% CI : (0.7718, 0.9453)
##     No Information Rate : 0.6308          
##     P-Value [Acc > NIR] : 8.823e-06       
##                                           
##                   Kappa : 0.7446          
##                                           
##  Mcnemar's Test P-Value : 0.2888          
##                                           
##             Sensitivity : 0.8537          
##             Specificity : 0.9167          
##          Pos Pred Value : 0.9459          
##          Neg Pred Value : 0.7857          
##              Prevalence : 0.6308          
##          Detection Rate : 0.5385          
##    Detection Prevalence : 0.5692          
##       Balanced Accuracy : 0.8852          
##                                           
##        'Positive' Class : AIMX
# ROC曲线
p_load(ROCit)

# 需要将因子变量转换为数值型
obj_roc <- rocit(score = as.numeric(obj), 
                 class = as.numeric(test_pre))

summary(obj_roc)
## Method used: empirical               
## Number of positive(s): 28                
## Number of negative(s): 37                
## Area under curve: 0.8658
plot(obj_roc, legend = F, YIndex = F)
ROC曲线

图中虚线表示基准值,如果实线在虚线之下,说明模型效果不如随机猜测有效。

2、多分类

与二分类类似,标签多于两个,算法包括决策树、朴素贝叶斯、支持向量机等,而决策树又包括C4.5、CART、C5.0方法,本例使用CART方法。

train_raw <- train_sentences %>% 
  as_tibble() %>% 
  filter(class.text %in% c("AIMX", "CONT", "BASE"))

test_raw <- test_sentences %>% 
  as_tibble() %>% 
  filter(class.text %in% c("AIMX", "CONT", "BASE"))

table(train_raw$class.text)
## 
## AIMX BASE CONT 
##  149   48  144
sel_word <- train_raw %>% 
  # 分词
  unnest_tokens(word, text) %>% 
  # 去除停用词
  anti_join(stop_words) %>% 
  group_by(class.text) %>% 
  count(word) %>% 
  ungroup() %>% 
  bind_tf_idf(word, class.text, n) %>% 
  distinct(word, .keep_all = T) %>% 
  # 前100
  top_n(100, tf_idf) %>% 
  select(word, tf_idf)

sel_word
## # A tibble: 127 × 2
##    word           tf_idf
##    <chr>           <dbl>
##  1 adopted       0.00334
##  2 algebraic     0.00334
##  3 attempt       0.00334
##  4 balanced      0.00334
##  5 bernoulli     0.00334
##  6 circumstances 0.00334
##  7 continuous    0.00556
##  8 drawn         0.00334
##  9 experience    0.00389
## 10 implies       0.00334
## # … with 117 more rows
# 重新构造训练集和测试集
train1 <- train_raw %>% 
  # 为每一个文本单独编号
  mutate(id = 1:n()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>%
  # 筛选目标词
  inner_join(sel_word) %>% 
  # 去除句内重复
  distinct(id, word, .keep_all = T) %>% 
  # 长表转宽表
  wider_dt(name = "word",
           value = "tf_idf",
           fill = 0)

# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
train <- train_raw %>% 
  mutate(id = 1:n()) %>% 
  select(id, class.text) %>% 
  left_join(train1) %>% 
  # 缺失值插入0
  replace_na_dt(to = 0) %>% 
  select(-id)

# 测试集进行同样的操作
test1 <- test_raw %>% 
  # 为每一个文本单独编号
  mutate(id = 1:n()) %>% 
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>%
  # 筛选目标词
  inner_join(sel_word) %>% 
  # 去除句内重复
  distinct(id, word, .keep_all = T) %>% 
  # 长表转宽表
  wider_dt(name = "word",
           value = "tf_idf",
           fill = 0)

# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
test2 <- test_raw %>% 
  mutate(id = 1:n()) %>% 
  select(id, class.text) %>% 
  left_join(test1) %>% 
  # 缺失值插入0
  replace_na_dt(to = 0) %>% 
  select(-id)

# 数据框需要补齐到与训练集长度一致,所以需要补全所有单词,并标记为0
# 获取需要补全的单词
to_add <- setdiff(names(train), names(test2))

test <- test2[, (to_add) := 0]

# 检查长度是否一致
length(train) == length(test)
## [1] TRUE
# 检查两列名称是否一致
setequal(names(train), names(test))
## [1] TRUE
# 响应变量转化为因子型,否则很多机器学习模型会误认为是回归问题
train <- train %>% 
  mutate_dt(class.text = as.factor(class.text))

test <- test %>% 
  mutate_dt(class.text = as.factor(class.text))

# 建模分析与评估
p_load(rpart)
rpart_model <- rpart(class.text ~ ., data = train)
summary(rpart_model)
## Call:
## rpart(formula = class.text ~ ., data = train)
##   n= 341 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.1979167      0 1.0000000 1.0781250 0.04697417
## 2 0.0100000      1 0.8020833 0.8020833 0.04786331
## 
## Variable importance
##     paper   adopted   attempt     drawn   implies weakening 
##        60         8         8         8         8         8 
## 
## Node number 1: 341 observations,    complexity param=0.1979167
##   predicted class=AIMX  expected loss=0.5630499  P(node) =1
##     class counts:   149    48   144
##    probabilities: 0.437 0.141 0.422 
##   left son=2 (45 obs) right son=3 (296 obs)
##   Primary splits:
##       paper     < 0.00441169  to the right, improve=23.668390, (0 missing)
##       difficult < 0.00398667  to the left,  improve= 7.949241, (0 missing)
##       limited   < 0.003701908 to the left,  improve= 7.358934, (0 missing)
##       introduce < 0.002779889 to the right, improve= 5.307298, (0 missing)
##       neuronal  < 0.003417146 to the left,  improve= 5.033265, (0 missing)
##   Surrogate splits:
##       adopted   < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
##       attempt   < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
##       drawn     < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
##       implies   < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
##       weakening < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
## 
## Node number 2: 45 observations
##   predicted class=AIMX  expected loss=0.04444444  P(node) =0.1319648
##     class counts:    43     2     0
##    probabilities: 0.956 0.044 0.000 
## 
## Node number 3: 296 observations
##   predicted class=CONT  expected loss=0.5135135  P(node) =0.8680352
##     class counts:   106    46   144
##    probabilities: 0.358 0.155 0.486
# 测试集
obj <- pull(test, class.text)

# 对测试集进行预测
test_pre <- predict(rpart_model, test, type = "class") %>% 
  as.factor()

# 混淆矩阵计算精确度、KAPPA值
caret::confusionMatrix(test_pre, obj)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction AIMX BASE CONT
##       AIMX   15    0    0
##       BASE    0    0    0
##       CONT   26   13   24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.3846, 0.6154)
##     No Information Rate : 0.5256          
##     P-Value [Acc > NIR] : 0.7149          
##                                           
##                   Kappa : 0.2312          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: AIMX Class: BASE Class: CONT
## Sensitivity               0.3659      0.0000      1.0000
## Specificity               1.0000      1.0000      0.2778
## Pos Pred Value            1.0000         NaN      0.3810
## Neg Pred Value            0.5873      0.8333      1.0000
## Prevalence                0.5256      0.1667      0.3077
## Detection Rate            0.1923      0.0000      0.3077
## Detection Prevalence      0.1923      0.0000      0.8077
## Balanced Accuracy         0.6829      0.5000      0.6389
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 215,076评论 6 497
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 91,658评论 3 389
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 160,732评论 0 350
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 57,493评论 1 288
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 66,591评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,598评论 1 293
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,601评论 3 415
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,348评论 0 270
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,797评论 1 307
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,114评论 2 330
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,278评论 1 344
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,953评论 5 339
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,585评论 3 322
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,202评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,442评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,180评论 2 367
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,139评论 2 352

推荐阅读更多精彩内容