1、二分类
p_load(fastrtext, tidyfst)
data("train_sentences")
data("test_sentences")
# 只提取(“AIMX”和“CONT”)两类
train_raw <- train_sentences %>%
as_tibble() %>%
filter(class.text %in% c("AIMX", "CONT"))
test_raw <- test_sentences %>%
as_tibble() %>%
filter(class.text %in% c("AIMX", "CONT"))
table(train_raw$class.text)
##
## AIMX CONT
## 149 144
# 预处理,分词、去除停止词,计算tf-idf
p_load(tidytext)
sel_word <- train_raw %>%
# 分词
unnest_tokens(word, text) %>%
# 去除停用词
anti_join(stop_words) %>%
group_by(class.text) %>%
count(word) %>%
ungroup() %>%
bind_tf_idf(word, class.text, n) %>%
distinct(word, .keep_all = T) %>%
# 前100
top_n(100, tf_idf) %>%
select(word, tf_idf)
sel_word
## # A tibble: 107 × 2
## word tf_idf
## <chr> <dbl>
## 1 adopted 0.00210
## 2 algebraic 0.00210
## 3 anchors 0.00140
## 4 answer 0.00175
## 5 ases 0.00210
## 6 attempt 0.00210
## 7 avoid 0.00175
## 8 balanced 0.00210
## 9 bernoulli 0.00210
## 10 circumstances 0.00210
## # … with 97 more rows
因为分值相同,所以最终结果多余100个。
# 重新构造训练集和测试集
train1 <- train_raw %>%
# 为每一个文本单独编号
mutate(id = 1:n()) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
# 筛选目标词
inner_join(sel_word) %>%
# 去除句内重复
distinct(id, word, .keep_all = T) %>%
# 长表转宽表
wider_dt(name = "word",
value = "tf_idf",
fill = 0)
# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
train <- train_raw %>%
mutate(id = 1:n()) %>%
select(id, class.text) %>%
left_join(train1) %>%
# 缺失值插入0
replace_na_dt(to = 0) %>%
select(-id)
# 测试集进行同样的操作
test1 <- test_raw %>%
# 为每一个文本单独编号
mutate(id = 1:n()) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
# 筛选目标词
inner_join(sel_word) %>%
# 去除句内重复
distinct(id, word, .keep_all = T) %>%
# 长表转宽表
wider_dt(name = "word",
value = "tf_idf",
fill = 0)
# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
test2 <- test_raw %>%
mutate(id = 1:n()) %>%
select(id, class.text) %>%
left_join(test1) %>%
# 缺失值插入0
replace_na_dt(to = 0) %>%
select(-id)
# 数据框需要补齐到与训练集长度一致,所以需要补全所有单词,并标记为0
# 获取需要补全的单词
to_add <- setdiff(names(train), names(test2))
# 一定要有小括号,表示向量
test <- test2[, (to_add) := 0]
# 检查长度是否一致
length(train) == length(test)
## [1] TRUE
# 检查两列名称是否一致
setequal(names(train), names(test))
## [1] TRUE
# 响应变量转化为因子型,否则很多机器学习模型会误认为是回归问题
train <- train %>%
mutate_dt(class.text = as.factor(class.text))
test <- test %>%
mutate_dt(class.text = as.factor(class.text))
# 建模分析与评估
train_model <- glm(class.text ~ ., data = train,
family = "binomial")
summary(train_model)
##
## Call:
## glm(formula = class.text ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.62589 -0.00003 0.00000 0.00004 0.78760
##
## Coefficients: (13 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.012e+00 2.919e-01 3.465 0.00053 ***
## adopted -1.111e+03 6.440e+06 0.000 0.99986
## al 9.213e+03 6.783e+06 0.001 0.99892
## algebraic -1.027e+04 6.089e+06 -0.002 0.99865
## anchors 1.443e+04 2.449e+07 0.001 0.99953
## answer 1.306e+04 2.365e+07 0.001 0.99956
## ases -9.862e+03 5.586e+06 -0.002 0.99859
## aspects 1.086e+04 6.838e+06 0.002 0.99873
## attempt 9.617e+03 1.633e+07 0.001 0.99953
## avoid -1.333e+03 1.234e+07 0.000 0.99991
## balanced -2.050e+04 2.912e+07 -0.001 0.99944
## bernoulli -1.073e+04 5.670e+06 -0.002 0.99849
## biological 1.149e+04 4.592e+07 0.000 0.99980
## bounds -3.674e+03 4.995e+07 0.000 0.99994
## cdna -1.073e+03 1.901e+07 0.000 0.99995
## cellular -3.655e+02 9.442e+06 0.000 0.99997
## choices 6.010e+03 5.164e+06 0.001 0.99907
## circumstances 1.217e+04 3.864e+07 0.000 0.99975
## clinical 1.352e+04 7.915e+07 0.000 0.99986
## computer -1.514e+04 7.555e+06 -0.002 0.99840
## conditions -2.507e+01 2.147e+07 0.000 1.00000
## continuous -6.085e+03 3.183e+06 -0.002 0.99847
## control -1.609e+04 2.690e+07 -0.001 0.99952
## current -1.077e+04 2.630e+07 0.000 0.99967
## describe -1.609e+04 2.083e+07 -0.001 0.99938
## designed -1.077e+04 2.197e+07 0.000 0.99961
## difficult 3.709e+03 2.250e+06 0.002 0.99868
## discuss -1.566e+04 1.092e+07 -0.001 0.99886
## drawn NA NA NA NA
## easy 1.357e+04 1.206e+07 0.001 0.99910
## empirical 1.369e+04 9.249e+06 0.001 0.99882
## equal 1.144e+04 9.394e+06 0.001 0.99903
## evs -2.972e+04 4.291e+07 -0.001 0.99945
## examine -1.216e+04 6.315e+06 -0.002 0.99846
## existing -1.482e+04 8.713e+07 0.000 0.99986
## expected -5.598e+03 1.838e+07 0.000 0.99976
## experience -8.787e-08 1.944e+07 0.000 1.00000
## extends 1.168e+04 4.297e+07 0.000 0.99978
## extensive -1.135e+03 3.134e+07 0.000 0.99997
## fixed 1.449e+04 4.484e+07 0.000 0.99974
## generalization -2.215e+03 8.198e+07 0.000 0.99998
## generation -8.624e+02 1.003e+07 0.000 0.99993
## genome -1.481e+04 8.492e+06 -0.002 0.99861
## implies NA NA NA NA
## independently 1.505e+04 3.466e+07 0.000 0.99965
## indirect 1.368e+04 9.175e+06 0.001 0.99881
## influence 4.287e+01 2.807e+07 0.000 1.00000
## influences 1.029e+04 2.848e+07 0.000 0.99971
## introduce -6.291e+03 1.095e+07 -0.001 0.99954
## iterations -1.870e+02 1.374e+07 0.000 0.99999
## judgments NA NA NA NA
## libraries NA NA NA NA
## library -1.604e+04 2.946e+07 -0.001 0.99957
## limitations 2.338e+02 3.617e+07 0.000 0.99999
## limited 4.238e+03 2.048e+06 0.002 0.99835
## means -1.467e+04 4.186e+07 0.000 0.99972
## measure 1.442e+04 2.410e+07 0.001 0.99952
## minimization 1.154e+04 2.074e+07 0.001 0.99956
## missing 1.392e+04 1.551e+07 0.001 0.99928
## modified 1.494e+03 3.181e+07 0.000 0.99996
## mouse 1.075e+03 2.722e+07 0.000 0.99997
## network -7.355e+03 3.956e+06 -0.002 0.99852
## neuronal 4.767e+03 3.914e+06 0.001 0.99903
## openness 9.574e+03 4.421e+07 0.000 0.99983
## optimal 1.430e+04 1.174e+07 0.001 0.99903
## paper -1.342e+03 4.259e+05 -0.003 0.99749
## parallel -9.937e+03 6.048e+06 -0.002 0.99869
## parameters 1.383e+04 5.279e+07 0.000 0.99979
## parametric 1.144e+04 1.627e+07 0.001 0.99944
## peptides -8.782e+03 4.142e+06 -0.002 0.99831
## personality -1.537e+04 3.681e+07 0.000 0.99967
## pie NA NA NA NA
## pml -2.615e+04 4.132e+07 -0.001 0.99949
## pro NA NA NA NA
## probability -1.171e+04 4.055e+07 0.000 0.99977
## processes NA NA NA NA
## propose -9.195e+03 5.952e+06 -0.002 0.99877
## question -9.195e+03 8.418e+06 -0.001 0.99913
## radically 1.884e+04 2.282e+07 0.001 0.99934
## range NA NA NA NA
## regularized NA NA NA NA
## representative -2.824e+04 2.248e+07 -0.001 0.99900
## require 3.128e+02 8.615e+06 0.000 0.99997
## response 1.078e+04 6.368e+06 0.002 0.99865
## risk -1.073e+04 1.389e+07 -0.001 0.99938
## sampling NA NA NA NA
## sces -1.616e+04 4.698e+07 0.000 0.99973
## selection 4.287e+01 1.965e+07 0.000 1.00000
## sequences -4.554e+02 1.504e+07 0.000 0.99998
## short -5.361e+03 9.521e+07 0.000 0.99996
## slightly -4.269e+02 2.352e+07 0.000 0.99999
## stability 1.701e+04 3.439e+07 0.000 0.99961
## structure -9.061e+03 5.052e+06 -0.002 0.99857
## students -1.306e+04 2.365e+07 -0.001 0.99956
## substantial 1.133e+04 7.623e+06 0.001 0.99881
## tailored -2.747e+02 2.322e+07 0.000 0.99999
## target 1.501e+03 2.632e+07 0.000 0.99995
## taxonomy NA NA NA NA
## theorem -5.853e+02 9.615e+06 0.000 0.99995
## therapy 7.685e+03 6.300e+06 0.001 0.99903
## trait -1.744e+04 1.837e+07 -0.001 0.99924
## transfer -1.287e+04 1.179e+07 -0.001 0.99913
## type -1.482e+04 8.563e+06 -0.002 0.99862
## typically 1.105e+04 6.614e+06 0.002 0.99867
## uncertainty -5.476e-08 1.591e+07 0.000 1.00000
## unlike 1.133e+04 1.548e+07 0.001 0.99942
## weakening NA NA NA NA
## widely NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 406.10 on 292 degrees of freedom
## Residual deviance: 69.59 on 198 degrees of freedom
## AIC: 259.59
##
## Number of Fisher Scoring iterations: 20
obj <- pull(test, class.text)
# 对测试集进行预测
test_pre <- predict.glm(train_model,
select(test, -class.text),
type = "response")
test_pre <- ifelse(test_pre >= 0.5,
levels(obj)[2], levels(obj)[1]) %>%
as.factor()
# 混淆矩阵计算精确度、KAPPA值
caret::confusionMatrix(test_pre, obj)
## Confusion Matrix and Statistics
##
## Reference
## Prediction AIMX CONT
## AIMX 35 2
## CONT 6 22
##
## Accuracy : 0.8769
## 95% CI : (0.7718, 0.9453)
## No Information Rate : 0.6308
## P-Value [Acc > NIR] : 8.823e-06
##
## Kappa : 0.7446
##
## Mcnemar's Test P-Value : 0.2888
##
## Sensitivity : 0.8537
## Specificity : 0.9167
## Pos Pred Value : 0.9459
## Neg Pred Value : 0.7857
## Prevalence : 0.6308
## Detection Rate : 0.5385
## Detection Prevalence : 0.5692
## Balanced Accuracy : 0.8852
##
## 'Positive' Class : AIMX
# ROC曲线
p_load(ROCit)
# 需要将因子变量转换为数值型
obj_roc <- rocit(score = as.numeric(obj),
class = as.numeric(test_pre))
summary(obj_roc)
## Method used: empirical
## Number of positive(s): 28
## Number of negative(s): 37
## Area under curve: 0.8658
plot(obj_roc, legend = F, YIndex = F)
图中虚线表示基准值,如果实线在虚线之下,说明模型效果不如随机猜测有效。
2、多分类
与二分类类似,标签多于两个,算法包括决策树、朴素贝叶斯、支持向量机等,而决策树又包括C4.5、CART、C5.0方法,本例使用CART方法。
train_raw <- train_sentences %>%
as_tibble() %>%
filter(class.text %in% c("AIMX", "CONT", "BASE"))
test_raw <- test_sentences %>%
as_tibble() %>%
filter(class.text %in% c("AIMX", "CONT", "BASE"))
table(train_raw$class.text)
##
## AIMX BASE CONT
## 149 48 144
sel_word <- train_raw %>%
# 分词
unnest_tokens(word, text) %>%
# 去除停用词
anti_join(stop_words) %>%
group_by(class.text) %>%
count(word) %>%
ungroup() %>%
bind_tf_idf(word, class.text, n) %>%
distinct(word, .keep_all = T) %>%
# 前100
top_n(100, tf_idf) %>%
select(word, tf_idf)
sel_word
## # A tibble: 127 × 2
## word tf_idf
## <chr> <dbl>
## 1 adopted 0.00334
## 2 algebraic 0.00334
## 3 attempt 0.00334
## 4 balanced 0.00334
## 5 bernoulli 0.00334
## 6 circumstances 0.00334
## 7 continuous 0.00556
## 8 drawn 0.00334
## 9 experience 0.00389
## 10 implies 0.00334
## # … with 117 more rows
# 重新构造训练集和测试集
train1 <- train_raw %>%
# 为每一个文本单独编号
mutate(id = 1:n()) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
# 筛选目标词
inner_join(sel_word) %>%
# 去除句内重复
distinct(id, word, .keep_all = T) %>%
# 长表转宽表
wider_dt(name = "word",
value = "tf_idf",
fill = 0)
# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
train <- train_raw %>%
mutate(id = 1:n()) %>%
select(id, class.text) %>%
left_join(train1) %>%
# 缺失值插入0
replace_na_dt(to = 0) %>%
select(-id)
# 测试集进行同样的操作
test1 <- test_raw %>%
# 为每一个文本单独编号
mutate(id = 1:n()) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
# 筛选目标词
inner_join(sel_word) %>%
# 去除句内重复
distinct(id, word, .keep_all = T) %>%
# 长表转宽表
wider_dt(name = "word",
value = "tf_idf",
fill = 0)
# 有可能有的文档完全没有目标词,所以需要补充,并标记为0
test2 <- test_raw %>%
mutate(id = 1:n()) %>%
select(id, class.text) %>%
left_join(test1) %>%
# 缺失值插入0
replace_na_dt(to = 0) %>%
select(-id)
# 数据框需要补齐到与训练集长度一致,所以需要补全所有单词,并标记为0
# 获取需要补全的单词
to_add <- setdiff(names(train), names(test2))
test <- test2[, (to_add) := 0]
# 检查长度是否一致
length(train) == length(test)
## [1] TRUE
# 检查两列名称是否一致
setequal(names(train), names(test))
## [1] TRUE
# 响应变量转化为因子型,否则很多机器学习模型会误认为是回归问题
train <- train %>%
mutate_dt(class.text = as.factor(class.text))
test <- test %>%
mutate_dt(class.text = as.factor(class.text))
# 建模分析与评估
p_load(rpart)
rpart_model <- rpart(class.text ~ ., data = train)
summary(rpart_model)
## Call:
## rpart(formula = class.text ~ ., data = train)
## n= 341
##
## CP nsplit rel error xerror xstd
## 1 0.1979167 0 1.0000000 1.0781250 0.04697417
## 2 0.0100000 1 0.8020833 0.8020833 0.04786331
##
## Variable importance
## paper adopted attempt drawn implies weakening
## 60 8 8 8 8 8
##
## Node number 1: 341 observations, complexity param=0.1979167
## predicted class=AIMX expected loss=0.5630499 P(node) =1
## class counts: 149 48 144
## probabilities: 0.437 0.141 0.422
## left son=2 (45 obs) right son=3 (296 obs)
## Primary splits:
## paper < 0.00441169 to the right, improve=23.668390, (0 missing)
## difficult < 0.00398667 to the left, improve= 7.949241, (0 missing)
## limited < 0.003701908 to the left, improve= 7.358934, (0 missing)
## introduce < 0.002779889 to the right, improve= 5.307298, (0 missing)
## neuronal < 0.003417146 to the left, improve= 5.033265, (0 missing)
## Surrogate splits:
## adopted < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
## attempt < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
## drawn < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
## implies < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
## weakening < 0.001667934 to the right, agree=0.886, adj=0.133, (0 split)
##
## Node number 2: 45 observations
## predicted class=AIMX expected loss=0.04444444 P(node) =0.1319648
## class counts: 43 2 0
## probabilities: 0.956 0.044 0.000
##
## Node number 3: 296 observations
## predicted class=CONT expected loss=0.5135135 P(node) =0.8680352
## class counts: 106 46 144
## probabilities: 0.358 0.155 0.486
# 测试集
obj <- pull(test, class.text)
# 对测试集进行预测
test_pre <- predict(rpart_model, test, type = "class") %>%
as.factor()
# 混淆矩阵计算精确度、KAPPA值
caret::confusionMatrix(test_pre, obj)
## Confusion Matrix and Statistics
##
## Reference
## Prediction AIMX BASE CONT
## AIMX 15 0 0
## BASE 0 0 0
## CONT 26 13 24
##
## Overall Statistics
##
## Accuracy : 0.5
## 95% CI : (0.3846, 0.6154)
## No Information Rate : 0.5256
## P-Value [Acc > NIR] : 0.7149
##
## Kappa : 0.2312
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: AIMX Class: BASE Class: CONT
## Sensitivity 0.3659 0.0000 1.0000
## Specificity 1.0000 1.0000 0.2778
## Pos Pred Value 1.0000 NaN 0.3810
## Neg Pred Value 0.5873 0.8333 1.0000
## Prevalence 0.5256 0.1667 0.3077
## Detection Rate 0.1923 0.0000 0.3077
## Detection Prevalence 0.1923 0.0000 0.8077
## Balanced Accuracy 0.6829 0.5000 0.6389