#单因素cox回归用于筛选与预后相关的免疫基因
#文件要求,带有临床生存与基因表达值的文件clinical+exp.txt
#生信课视频上的代码:不知道为什么就是运行不出来,于是我放弃了
library(survival)
pFilter=0.05
rt=read.table("clinical+exp.txt",header = T,sep="\t",check.names = F,row.names = 1)
outTab=data.frame()
sigGenes=c("funtime","funstat")
for (i in colnames(rt[,3:ncol(rt)])){
cox<-coxph(Surv(funtime,funstat)~rt[,i],data = rt)
coxSummary=summary(cox)
coxP=coxSummary$coefficients[,"Pr(>|z|)"]
if(coxP<pFilter){
sigGenes=c(sigGenes,i)
outTab=rbind(outTab,cbind(id=i,
HR=coxSummary$conf.int[,"exp(coef)"],
HRL=coxSummary$conf.int[,"lower .95"],
HRH=coxSummary$conf.int[,"upper .95"],
pValue=coxSummary$coefficients[,"Pr(>|z|)"]
}
}
#网友提供的方法
#需要多一个变量名的文件,是指差异表达的基因名,后来想想要不是有些捣乱的基因名,是不是用于做单因素分析的基因就是所需的covariates
rt=read.table("clinical_exp_gaibian.txt",header = T,sep="\t",check.names = F,row.names = 1)
covariates<-read.csv("names.csv")
covariates<-as.vector(covariates)
covariates
covariates=t(covariates)
time<-rt$funtime
time<-as.vector(time)
status<-rt$funstat
#有个小问题,就是变量命名里不能有-例如HLA-A,会出现找不到HLA的错误,这里把这些带-的都删掉了
univ_formulas <- sapply(covariates,
function(x) as.formula(paste('Surv(time, status)~', x)))
univ_models <- lapply( univ_formulas, function(x){coxph(x, data = rt)})
univ_results <- lapply(univ_models,
function(x){
x <- summary(x)
p.value<-signif(x$wald["pvalue"])
wald.test<-signif(x$wald["test"])
beta<-signif(x$coef[1]);#coeficient beta
HR <-signif(x$coef[2]);#exp(beta)#,digits=2代表保留两位有效数字
#HR <-signif(x$coef[2],digits = 2);
HR.confint.lower <- signif(x$conf.int[,"lower .95"])#,3,是3个有效数字
HR.confint.upper <- signif(x$conf.int[,"upper .95"])
HR <- paste0(HR, " (",
HR.confint.lower, "-", HR.confint.upper, ")")
res<-c(beta, HR, wald.test, p.value)
names(res)<-c("beta", "HR (95% CI for HR)", "wald.test",
"p.value")
return(res)
#return(exp(cbind(coef(x),confint(x))))
})
res <- t(as.data.frame(univ_results, check.names = FALSE))
as.data.frame(res)
write.csv(res,"cox_singel.csv")
#LSAAO回归
#文件要求:与单因素回归的文件格式相似
LASSO是为了避免过度拟合和删除高度相关的基因
library(glmnet)
library(survival)
setwd("C:/Users/仙女/Documents/liver cancer/LASSO")
rt=read.table("Lasso.txt",header = T,sep="\t",check.names = F,row.names = 1)
rt$funtime[rt$funtime<=0]=0.003
#LASSO回归中生存时间不能是0,否则会报错
x=as.matrix(rt[,c(3:ncol(rt))])
y=data.matrix(Surv(rt$funtime,rt$funstat))
fit<-glmnet(x,y,family = "cox",maxit = 1000)
pdf(file = "lambda.pdf",onefile = FALSE)
plot(fit,xvar="lambda",label=TRUE)
dev.off
cvfit<-cv.glmnet(x,y,family="cox",maxit=1000)
#做交叉验证并输出图形
pdf("cvfit.pdf",onefile = FALSE)
#onefile=FALSE很重要,因为如果没有这个输出图应该在pdf的第二页,前面有个空白的pdf页
plot(cvfit)
abline(v=log(c(cvfit$lambda.min,cvfit$lambda.1se)),lty="dashed")
dev.off()
#maxit=1000是随机模拟1000次,这代表这次做完之后把脚本关闭重新再做得出的基因是不相同的
coef<-coef(fit,s=cvfit$lambda.min)
index<-which(coef !=0)
actCoef<-coef[index]
lassoGene=rownames(coef)[index]
lassoGene=c("funtime","funstat",lassoGene)
lassoSigExp=rt[,lassoGene]
lassoSigExp=cbind(id=row.names(lassoSigExp),lassoSigExp)
write.table(lassoSigExp,file = "lassoSigExp2.txt",sep="\t",row.names = F,quote=F)
#虽然我的lambda图没有做成
#多变量cox回归及建模人群预测模型的构建
#文件要求:带有生存数据的基因表达
setwd("C:/Users/仙女/Documents/liver cancer/multi_cox")
rt=read.table("lassoSigExp.txt",header = T,sep="\t",check.names = F,row.names = 1)
rt[,"funtime"]=rt[,"funtime"]/365
#这里生存时间要转换成年
#使用建模数据构建模型
multiCox=coxph(Surv(funtime,funstat)~.,data = rt)
multiCox=step(multiCox,direction = "both")
multiCoxSum=summary(multiCox)
#输出模型相关信息
outTab=data.frame()
outTab=cbind(
coef=multiCoxSum$coefficients[,"coef"],
HR=multiCoxSum$conf.int[,"exp(coef)"],
HR.95L=multiCoxSum$conf.int[,"lower .95"],
HR.95H=multiCoxSum$conf.int[,"upper .95"],
pValue=multiCoxSum$coefficients[,"Pr(>|z|)"])
outTab=cbind(id=row.names(outTab),outTab)
write.table(outTab,file = "multiCox.xls",sep = "\t",row.names = F,quote=F)
#其实每运行一次产生的结果都不一样,即使是相同的lassoSigExp文件
#结果显示有的基因HR的p值并不小于0.05,这也是可以的,它可能跟别的基因互补一起预测生存
#输出病人的风险值
riskScore=predict(multiCox,type = "risk",newdata = rt)
coxGene=row.names(multiCoxSum$coefficients)
coxGene=gsub("`","",coxGene)
outCol=c("funtime","funstat",coxGene)
risk=as.vector(ifelse(riskScore>median(riskScore),"high","low"))
write.table(cbind(id=row.names(cbind(rt[,outCol],riskScore,risk)),cbind(rt[,outCol],riskScore,risk)),
file = "risk.txt",
sep="\t",
quote=F,
row.names = F)
#可是没有预测的具体公式,好像是summary(multiCox)的coef值
#模型验证
#输入验证人群的风险文件
rtTest=read.table("test.txt",header = T,sep="\t",check.names = F,row.names = 1)
rtTest[,"funtime"]=rtTest[,"funtime"]/365
riskScoreTest=predict(multiCox,type = "risk",newdata = rtTest)
riskTest=as.vector(ifelse(riskScoreTest>medianTrainRisk,"high","low"))
write.table(cbind(id=rownames(cbind(rtTest[,outCol],riskScoreTest,riskTest)),cbind(rtTest[,outCol],riskScoreTest,riskTest)),
file="riskTest.txt",
sep="\t",
quote=F,
row.names=F)