参考学习资料:TCGA的cox模型构建和风险森林图
接着之前做差异分析得到的数据数据
再来进一步分析:
1.准备输入数据
rm(list = ls())
options(stringsAsFactors = F)
load("tcga-kirc-raw.Rdata")
exprSet = expr[,group_list=="tumor"]
dim(exprSet)
## [1] 812 545
head(meta)
exprSet[1:4,1:4]
head(colnames(exprSet))
head(meta$submitter_id)
expr2=exprSet[,meta$submitter_id%in%substring(colnames(exprSet),1,12)]
meta=meta[meta$submitter_id%in%substring(colnames(exprSet),1,12),]
dim(meta)
dim(expr2)
exprSet=expr2
dim(exprSet)
s1=unique(colnames(expr2))#去重复
colnames(expr2)=substring(colnames(exprSet),1,12)
head(colnames(expr2))
s2=unique(colnames(expr2))
expr3=expr2[,s2]
#meta$submitter_id%in%colnames(expr3)
meta=meta[meta$submitter_id%in%colnames(expr3),]
exprSet=expr3
dim(meta)
## [1] 495 8
dim(exprSet)
## [1] 812 495
2.挑选感兴趣的基因构建coxph模型
e=t(exprSet[c("hsa-mir-21","hsa-mir-181a-1","hsa-mir-424",'hsa-mir-192',"hsa-mir-195"),])
e=log2(e)
colnames(e)=c('miR21','miR181a','miR424','miR192','miR195')
dat=cbind(meta,e)
dat$gender=factor(dat$gender)
dat$ajcc_pathologic_stage=factor(dat$ajcc_pathologic_stage)
colnames(dat)
dat$status=ifelse(dat$vital_status=="Alive",1,0)
colnames(dat)[8]="Stage"
#用survival::coxph()函数构建模型
library(survival)
library(survminer)
s=Surv(age_at_diagnosis, status) ~ gender + Stage + miR21+miR181a+miR424+miR192+miR195
model <- coxph(s, data = dat )

建模前
3.模型可视化-森林图
options(scipen=1)
ggforest(model, data =dat, main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1",
noDigits = 4)
4.模型预测
fp <- predict(model)
summary(model,data=dat)$concordance
library(Hmisc)
options(scipen=200)
#with(dat,rcorr.cens(fp,Surv(age_at_diagnosis, status)))
模型预测这一步看教程是个例子,弄了蛮久还是不太明白。
5.切割数据构建模型并预测
5.1 切割数据
library(caret)
set.seed(12345679)
sam<- createDataPartition(dat$status, p = .5, list = FALSE)
head(sam)
## Resample1
## [1,] 5
## [2,] 9
## [3,] 13
## [4,] 17
## [5,] 19
## [6,] 21
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
prop.table(table(train_meta$ajcc_pathologic_stage))
prop.table(table(test_meta$ajcc_pathologic_stage))
prop.table(table(test_meta$race))
prop.table(table(train_meta$race))
5.2 切割后的train数据集建模
e=t(train[c("hsa-mir-21","hsa-mir-181a-1","hsa-mir-424",'hsa-mir-192',"hsa-mir-195"),])
e=log2(e)
colnames(e)=c('miR21','miR181a','miR424','miR192','miR195')
dat=cbind(train_meta,e)
dat$gender=factor(dat$gender)
colnames(dat)[8]="Stage"
dat$Stage=factor(dat$Stage)
colnames(dat)
dat$status=ifelse(dat$vital_status=="Alive",1,0)
s=Surv(age_at_diagnosis, status) ~ gender + Stage + miR21+miR181a+miR424+miR192+miR195
model <- coxph(s, data = dat )
5.3 模型可视化
options(scipen=1)
ggforest(model, data =dat,
main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1", noDigits = 4)

建模后
Concordance Index:0.65
C-index用于计算生存分析中的COX模型预测值与真实之间的区分度(discrimination),也称为Harrell's concordanceindex。C-index在0.5-1之间。0.5为完全不一致,说明该模型没有预测作用,1为完全一致,说明该模型预测结果与实际完全一致。
本次建模不太行,但是还是有一定的预测作用。涉及到公式方面真的有点蒙,过程比较复杂不好理解,但是我应用的话知道判断的标准即可。