1. 简介
基于转录组训练集中所有样本,构建风险模型,筛选基因
机器学习方法:
- LASSO 逻辑回归算法构建模型筛选候选基因
- SVM-RFE 向量机-递归特征消除算法筛选候选基因
2. 数据信息
上一步获得的候选基因
image.png
3. 思路
- 使用R包“glmnet” LASSO逻辑回归算法构建模型,将lambda最小时模型中的候选基因作为满足要求的基因,用于后续分析。
- 使用R包“e1071”中的支持向量机-递归特征消除筛选候选基因,将该算法错误率最低时的候选基因作为符合条件的基因,用于后续分析。
- 将两种方法获得的基因取交集,作为特征基因。
4. 代码
library(glmnet)
library(tidyverse)
library(e1071)
library(caret)
library(ggplot2)
library(cowplot)
library(ggplotify)
##----- 1. 数据准备 -----
colnames(Candidate_names_df) <- "Symbol"
train_genes <- rownames_to_column(as.data.frame(exp_train),"Symbol")
train_genes <- merge(train_genes,Candidate_names_df, by = "Symbol", all.x = T )
train_genes <- column_to_rownames(train_genes,"Symbol")
hub_train_genes <- train_genes[rownames(train_genes) %in% Candidate_names, ]
p4 = identical(colnames(hub_train_genes),train_group$Sample);p4
label <- ifelse(train_group$label == "AAV", 1, 0)
##---- 2. 构建LASSO回归模型 ----
set.seed(321)
lasso <- glmnet(x = t(hub_train_genes),
y = label,
family = "gaussian",
standardize = TRUE,
nlambda = 100,
alpha = 1)
lasso_cv <- cv.glmnet(x = t(hub_train_genes),
y = label,
alpha = 1,
family = "gaussian",
standardize = TRUE,
nfolds = 10)
png("04.lasso_coefficient_path.png", width = 800, height = 600)
pdf("04.lasso_coefficient_path.pdf", width = 10, height = 10)
plot(lasso, xvar = "norm", label = TRUE)
abline(v = sum(abs(coef(lasso, s = lasso_cv$lambda.min)[-1])),
lty = 2, col = "red")
text(x = sum(abs(coef(lasso, s = lasso_cv$lambda.min)[-1])),
y = max(lasso$beta),
labels = paste("lambda.min =", round(lasso_cv$lambda.min, 4)),
pos = 4, col = "red", cex = 0.8)
dev.off()
png("04.lasso_cross_verify.png", width = 800, height = 600)
pdf("04.lasso_cross_verify.pdf", width = 10, height = 10)
plot(lasso_cv)
abline(v = log(lasso_cv$lambda.min), lty = 3)
text(x = log(lasso_cv$lambda.min),
y = max(lasso_cv$cvm),
labels = paste("lambda.min =", round(lasso_cv$lambda.min, 4)),
pos = 4, offset = 1)
dev.off()
optimal_lambda <- lasso_cv$lambda.min
print(optimal_lambda)
lasso_coef <- coef(lasso_cv, s = "lambda.min")
selected_genes_lasso <- rownames(lasso_coef)[which(lasso_coef != 0)]
selected_genes_lasso <- selected_genes_lasso[-1]
print(selected_genes_lasso)
write.csv(selected_genes_lasso, file = "04.selected_genes_LASSO.csv", row.names = FALSE)
##---- 3. SVM-RFE -----
svm_rfe <- function(data, labels, sizes) {
ctrl <- rfeControl(functions = caretFuncs, method = "cv", number = 10)
svmProfile <- rfe(data, labels, sizes = sizes, rfeControl = ctrl, method = "svmRadial")
return(svmProfile)
}
set.seed(123)
sizes <- seq(1, ncol(t(hub_train_genes)), by = 1)
svmProfile <- svm_rfe(t(hub_train_genes), label, sizes)
selected_genes_svm <- svmProfile$optVariables
write.csv(selected_genes_svm, file = "04.selected_genes_SVM.csv", row.names = FALSE)
results_svm <- svmProfile$results
pdf(file = "04.SVM_RFE_Accuracy.pdf",width = 5,height = 4,family='Times')
a <- dev.cur()
png(file = "04.SVM_RFE_Accuracy.png",width = 5, height=4, units="in", res=600,family='Times')
dev.control("enable")
par(mar = c(2,2,2,2));
plot(svmProfile, type=c("o"),
xgap.axis = 1
)
dev.copy(which = a)
dev.off()
dev.off()
## ------- venn ------
Venn_groups2 <- list( SVM = selected_genes_svm, LASSO = selected_genes_lasso)
venn.diagram(Venn_groups2, filename = 'venn_lasso_svm.png', imagetype = 'png',
fill = c('red', 'blue'),
alpha = 0.50,
cat.col = rep('black', 2),
scaled = F,
col = 'black',
cex = 2,
fontfamily = 'serif',
cat.cex = 1.5,
cat.fontfamily = 'serif',
cat.default.pos = "outer",
resolution = 400,
compression = "lzw",
cat.pos = c(-175, 175),
print.mode = c("raw", "percent")
)
venn2 <- venn.diagram(Venn_groups2,
filename = NULL,
fill = c('red', 'blue'),
alpha = 0.50,
cat.col = rep('black', 2),
scaled = F,
col = 'black',
cex = 2,
fontfamily = 'serif',
cat.cex = 1.5,
cat.fontfamily = 'serif',
cat.default.pos = "outer",
resolution = 400,
compression = "lzw",
cat.pos = c(-175, 175),
print.mode = c("raw", "percent")
)
pdf(file="venn_lasso_svm.pdf")
grid.draw(venn2)
dev.off()
keygene_names <- intersect(selected_genes_svm,selected_genes_lasso)
5. 结果展示
04.lasso_coefficient_path.png
04.lasso_cross_verify.png
04.SVM_RFE_Accuracy.png
venn_lasso_svm.png