【Rna-seq 分析流程】04 机器学习-构建风险模型

1. 简介

基于转录组训练集中所有样本,构建风险模型,筛选基因
机器学习方法:

  • LASSO 逻辑回归算法构建模型筛选候选基因
  • SVM-RFE 向量机-递归特征消除算法筛选候选基因

2. 数据信息

上一步获得的候选基因


image.png

3. 思路

  1. 使用R包“glmnet” LASSO逻辑回归算法构建模型,将lambda最小时模型中的候选基因作为满足要求的基因,用于后续分析。
  2. 使用R包“e1071”中的支持向量机-递归特征消除筛选候选基因,将该算法错误率最低时的候选基因作为符合条件的基因,用于后续分析。
  3. 将两种方法获得的基因取交集,作为特征基因。

4. 代码

library(glmnet)
library(tidyverse)
library(e1071)
library(caret)
library(ggplot2)
library(cowplot)
library(ggplotify)



##----- 1. 数据准备 -----
colnames(Candidate_names_df)  <- "Symbol"

train_genes <- rownames_to_column(as.data.frame(exp_train),"Symbol")
train_genes <- merge(train_genes,Candidate_names_df, by = "Symbol", all.x = T )
train_genes <- column_to_rownames(train_genes,"Symbol")
hub_train_genes <- train_genes[rownames(train_genes) %in% Candidate_names, ]

p4 = identical(colnames(hub_train_genes),train_group$Sample);p4
label <- ifelse(train_group$label == "AAV", 1, 0) 


##---- 2. 构建LASSO回归模型 ----
set.seed(321)
lasso <- glmnet(x = t(hub_train_genes), 
                y = label, 
                family = "gaussian",
                standardize = TRUE,
                nlambda = 100, 
                alpha = 1)
lasso_cv <- cv.glmnet(x = t(hub_train_genes), 
                      y = label, 
                      alpha = 1, 
                      family = "gaussian",  
                      standardize = TRUE,  
                      nfolds = 10)
png("04.lasso_coefficient_path.png", width = 800, height = 600)
pdf("04.lasso_coefficient_path.pdf", width = 10, height = 10)
plot(lasso, xvar = "norm", label = TRUE)
abline(v = sum(abs(coef(lasso, s = lasso_cv$lambda.min)[-1])), 
       lty = 2, col = "red")
text(x = sum(abs(coef(lasso, s = lasso_cv$lambda.min)[-1])), 
     y = max(lasso$beta), 
     labels = paste("lambda.min =", round(lasso_cv$lambda.min, 4)), 
     pos = 4, col = "red", cex = 0.8)
dev.off()

png("04.lasso_cross_verify.png", width = 800, height = 600)
pdf("04.lasso_cross_verify.pdf", width = 10, height = 10)
plot(lasso_cv) 
abline(v = log(lasso_cv$lambda.min), lty = 3)
text(x = log(lasso_cv$lambda.min), 
     y = max(lasso_cv$cvm), 
     labels = paste("lambda.min =", round(lasso_cv$lambda.min, 4)), 
     pos = 4, offset = 1)
dev.off()

optimal_lambda <- lasso_cv$lambda.min
print(optimal_lambda)
lasso_coef <- coef(lasso_cv, s = "lambda.min")
selected_genes_lasso <- rownames(lasso_coef)[which(lasso_coef != 0)] 
selected_genes_lasso <- selected_genes_lasso[-1] 
print(selected_genes_lasso)
write.csv(selected_genes_lasso, file = "04.selected_genes_LASSO.csv", row.names = FALSE)



##---- 3. SVM-RFE -----
svm_rfe <- function(data, labels, sizes) {
  ctrl <- rfeControl(functions = caretFuncs, method = "cv", number = 10)
  svmProfile <- rfe(data, labels, sizes = sizes, rfeControl = ctrl, method = "svmRadial")
  return(svmProfile)
}


set.seed(123)
sizes <- seq(1, ncol(t(hub_train_genes)), by = 1) 
svmProfile <- svm_rfe(t(hub_train_genes), label, sizes)
selected_genes_svm <- svmProfile$optVariables
write.csv(selected_genes_svm, file = "04.selected_genes_SVM.csv", row.names = FALSE)

results_svm <- svmProfile$results

pdf(file = "04.SVM_RFE_Accuracy.pdf",width = 5,height = 4,family='Times')
a <- dev.cur()  
png(file = "04.SVM_RFE_Accuracy.png",width = 5, height=4, units="in", res=600,family='Times')
dev.control("enable")
par(mar = c(2,2,2,2));
plot(svmProfile, type=c("o"),
     xgap.axis = 1
)
dev.copy(which = a)  
dev.off()
dev.off()


## ------- venn ------
Venn_groups2 <- list( SVM = selected_genes_svm, LASSO = selected_genes_lasso)
venn.diagram(Venn_groups2, filename = 'venn_lasso_svm.png', imagetype = 'png', 
             fill = c('red', 'blue'), 
             alpha = 0.50,   
             cat.col = rep('black', 2),  
             scaled = F,  
             col = 'black',   
             cex = 2,  
             fontfamily = 'serif',   
             cat.cex = 1.5,  
             cat.fontfamily = 'serif',
             cat.default.pos = "outer", 
             resolution = 400, 
             compression = "lzw",
             cat.pos = c(-175, 175), 
             print.mode = c("raw", "percent")  
)

venn2 <- venn.diagram(Venn_groups2, 
                      filename = NULL,
                      fill = c('red', 'blue'), 
                      alpha = 0.50,   
                      cat.col = rep('black', 2),  
                      scaled = F,  
                      col = 'black',   
                      cex = 2,  
                      fontfamily = 'serif',   
                      cat.cex = 1.5,  
                      cat.fontfamily = 'serif',
                      cat.default.pos = "outer", 
                      resolution = 400, 
                      compression = "lzw",
                      cat.pos = c(-175, 175), 
                      print.mode = c("raw", "percent") 
)
pdf(file="venn_lasso_svm.pdf")
grid.draw(venn2)
dev.off()

keygene_names <- intersect(selected_genes_svm,selected_genes_lasso) 

5. 结果展示

04.lasso_coefficient_path.png
04.lasso_cross_verify.png
04.SVM_RFE_Accuracy.png
venn_lasso_svm.png
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容