简书:没有生物学重复的转录组数据怎么进行差异分析?
edgeR需要的数据是reads数,可以设置BCV值,做单样本的差异分析。
edgeR包可以做无重复的差异分析,不过需要认为指定一个dispersion值(设置BCV值),这样得到的结果比较主观,不同的人就可以有不同的结果。通常如果是实验控制的好的人类数据,那么选择BCV=0.4,比较好的模式生物选择BCV=0.1。
6 LUAD验证
6.2 差异基因分析和功能分析
差异基因分析
library(limma)
rt=read.table("OV.txt",header=T,sep="\t",check.names=F)
#如果一个基因占了多行,取均值
rt=as.matrix(rt)
rownames(rt)=rt[,1]
exp=rt[,2:ncol(rt)]
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data=avereps(data)
data<-as.data.frame(data)
nrow(data)
data=data[rowMeans(data)>0.5,]
group_list=c(rep("high",177),rep("low",176))
table(group_list)
library(edgeR)
dge <- DGEList(counts=data,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~0+group_list)
rownames(design)<-colnames(dge)
colnames(design)<-levels(group_list)
dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1))
DEG=topTags(fit2, n=nrow(data))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 2
DEG$change = as.factor(
ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT'))
head(DEG)
table(DEG$change)
edgeR_DEG <- DEG
#提取差异基因表达矩阵
DEG<-DEG[DEG$change!="NOT",]
library(data.table)
#fintersect求交集
a<-data.table(rownames(DEG))
diff<-data[match(a$V1,row.names(data)),]
write.csv(diff,file = "diff.csv",quote=F,row.names = T)
source("3-plotfunction.R")
logFC_cutoff <- 2
dat = log(data+1)
pca.plot = draw_pca(dat,group_list)
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
h2 = draw_heatmap(data[cg2,],group_list)
v2 = draw_volcano(test = edgeR_DEG[,c(1,4,6)],pkg = 2)
v2
library(patchwork)
(h2) / (v2) +plot_layout(guides = 'collect')
ggsave("heat_volcano.png",width = 7,height = 5)
功能分析
library("org.Hs.eg.db")
rt=read.csv("diff.csv",sep=",",check.names=F,header=T)
rt<-rt[,1]
rt<-as.data.frame(rt)
genes=as.vector(rt[,1])
entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA)
entrezIDs <- as.character(entrezIDs)
out=cbind(rt,entrezID=entrezIDs)
write.table(out,file="id.txt",sep="\t",quote=F,row.names=F)
###############GO分析
library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")
rt=read.table("id.txt",sep="\t",header=T,check.names=F)
rt=rt[is.na(rt[,"entrezID"])==F,]
cor=rt$cor
gene=rt$entrezID
names(cor)=gene
#GO富集分析
kk <- enrichGO(gene = gene,
OrgDb = org.Hs.eg.db,
pvalueCutoff =0.05,
qvalueCutoff = 0.05,
ont="all",
readable =T)
write.table(kk,file="GO.txt",sep="\t",quote=F,row.names = F)
#柱状图
tiff(file="GO.tiff",width = 26,height = 20,units ="cm",compression="lzw",bg="white",res=600)
barplot(kk, drop = TRUE, showCategory =10,split="ONTOLOGY") + facet_grid(ONTOLOGY~., scale='free')
dev.off()
#####################KEGG分析
library("clusterProfiler")
library("org.Hs.eg.db")
library("enrichplot")
library("ggplot2")
setwd("C:\\Users\\lexb4\\Desktop\\CCLE\\09.KEGG")
rt=read.table("id.txt",sep="\t",header=T,check.names=F)
rt=rt[is.na(rt[,"entrezID"])==F,]
cor=rt$cor
gene=rt$entrezID
names(cor)=gene
#kegg富集分析
kk <- enrichKEGG(gene = gene, organism = "hsa", pvalueCutoff =0.05, qvalueCutoff =0.05)
write.table(kk,file="KEGG.txt",sep="\t",quote=F,row.names = F)
#柱状图
tiff(file="barplot.tiff",width = 20,height = 12,units ="cm",compression="lzw",bg="white",res=600)
barplot(kk, drop = TRUE, showCategory = 20)
dev.off()
6.3 批量KM曲线
输入数据KM.TXT,数据是FPKM.
library(survival)
rt=read.table("KM.txt",header=T,sep="\t",check.names=F)
outTab=data.frame()
for(gene in colnames(rt[,4:ncol(rt)])){
a=rt[,gene]<=median(rt[,gene])
diff=survdiff(Surv(futime, fustat) ~a,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
outTab=rbind(outTab,cbind(gene=gene,pvalue=pValue))
fit <- survfit(Surv(futime, fustat) ~ a, data = rt)
summary(fit)
#只将p<0.05的基因画出来。
if(pValue<0.05){
if(pValue<0.001){
pValue="<0.001"
}else{
pValue=round(pValue,3)
pValue=paste0("=",pValue)
}
pdf(file=paste(gene,".survival.pdf",sep=""),
width=6,
height=6)
plot(fit,
lwd=2,
col=c("red","blue"),
xlab="Time (year)",
mark.time=T, #显示删失数据
ylab="Survival rate",
main=paste(gene,"(p", pValue ,")",sep=""))
legend("topright",
c("High","Low"),
lwd=2,
col=c("red","blue"))
dev.off()
}
}
write.table(outTab,file="survival.xls",sep="\t",row.names=F,quote=F)
6.4 差异基因进行COX模型构建
用FPKM数据做COX模型的构建,用OS相关的基因做分析。
输入数据exptime.txt
单因素cox
pFilter=0.01 #定义单因素显著性
library(survival) #引用包
rt=read.table("expTime.txt",header=T,sep="\t",check.names=F,row.names=1) #读取输入文件
sigGenes=c("futime","fustat")
outTab=data.frame()
for(i in colnames(rt[,3:ncol(rt)])){
cox <- coxph(Surv(futime, fustat) ~ rt[,i], data = rt)
coxSummary = summary(cox)
coxP=coxSummary$coefficients[,"Pr(>|z|)"]
outTab=rbind(outTab,
cbind(id=i,
HR=coxSummary$conf.int[,"exp(coef)"],
HR.95L=coxSummary$conf.int[,"lower .95"],
HR.95H=coxSummary$conf.int[,"upper .95"],
pvalue=coxSummary$coefficients[,"Pr(>|z|)"])
)
if(coxP<pFilter){
sigGenes=c(sigGenes,i)
}
}
write.table(outTab,file="uniCox.xls",sep="\t",row.names=F,quote=F)
uniSigExp=rt[,sigGenes]
uniSigExp=cbind(id=row.names(uniSigExp),uniSigExp)
write.table(uniSigExp,file="uniSigExp.txt",sep="\t",row.names=F,quote=F)
LASSO回归
library("glmnet")
library("survival")
rt=read.table("uniSigExp.txt",header=T,sep="\t",row.names=1,check.names=F) #读取文件
rt$futime[rt$futime<=0]=1
x=as.matrix(rt[,c(3:ncol(rt))])
y=data.matrix(Surv(rt$futime,rt$fustat))
fit <- glmnet(x, y, family = "cox", maxit = 1000)
pdf("lambda.pdf")
plot(fit, xvar = "lambda", label = TRUE)
dev.off()
cvfit <- cv.glmnet(x, y, family="cox", maxit = 1000)
pdf("cvfit.pdf")
plot(cvfit)
abline(v=log(c(cvfit$lambda.min,cvfit$lambda.1se)),lty="dashed")
dev.off()
coef <- coef(fit, s = cvfit$lambda.min)
index <- which(coef != 0)
actCoef <- coef[index]
lassoGene=row.names(coef)[index]
lassoGene=c("futime","fustat",lassoGene)
lassoSigExp=rt[,lassoGene]
lassoSigExp=cbind(id=row.names(lassoSigExp),lassoSigExp)
write.table(lassoSigExp,file="lassoSigExp.txt",sep="\t",row.names=F,quote=F)
多因素COX模型的构建
library(survival)
library(survminer)
rt=read.table("lassoSigExp.txt",header=T,sep="\t",check.names=F,row.names=1) #读取train输入文件
rt[,"futime"]=rt[,"futime"]/365
#使用train组构建COX模型
multiCox=coxph(Surv(futime, fustat) ~ ., data = rt)
multiCox=step(multiCox,direction = "both")
multiCoxSum=summary(multiCox)
#输出模型相关信息
outTab=data.frame()
outTab=cbind(
coef=multiCoxSum$coefficients[,"coef"],
HR=multiCoxSum$conf.int[,"exp(coef)"],
HR.95L=multiCoxSum$conf.int[,"lower .95"],
HR.95H=multiCoxSum$conf.int[,"upper .95"],
pvalue=multiCoxSum$coefficients[,"Pr(>|z|)"])
outTab=cbind(id=row.names(outTab),outTab)
write.table(outTab,file="multiCox.xls",sep="\t",row.names=F,quote=F)
#绘制森林图
pdf(file="forest.pdf",
width = 8, #图片的宽度
height = 5, #图片的高度
)
ggforest(multiCox,
main = "Hazard ratio",
cpositions = c(0.02,0.22, 0.4),
fontsize = 0.7,
refLabel = "reference",
noDigits = 2)
dev.off()
#输出train组风险文件
riskScore=predict(multiCox,type="risk",newdata=rt) #利用train得到模型预测train样品风险
coxGene=rownames(multiCoxSum$coefficients)
coxGene=gsub("`","",coxGene)
outCol=c("futime","fustat",coxGene)
medianTrainRisk=median(riskScore)
risk=as.vector(ifelse(riskScore>medianTrainRisk,"high","low"))
write.table(cbind(id=rownames(cbind(rt[,outCol],riskScore,risk)),cbind(rt[,outCol],riskScore,risk)),
file="riskTrain.txt",
sep="\t",
quote=F,
row.names=F)
####################下面是验证集的风险文件输出#################
#输出test组风险文件
rtTest=read.table("test.txt",header=T,sep="\t",check.names=F,row.names=1) #读取test输入文件
rtTest[,"futime"]=rtTest[,"futime"]/365
riskScoreTest=predict(multiCox,type="risk",newdata=rtTest) #利用train得到模型预测test样品风险
riskTest=as.vector(ifelse(riskScoreTest>medianTrainRisk,"high","low"))
write.table(cbind(id=rownames(cbind(rtTest[,outCol],riskScoreTest,riskTest)),cbind(rtTest[,outCol],riskScore=riskScoreTest,risk=riskTest)),
file="riskTest.txt",
sep="\t",
quote=F,
row.names=F)
6.5 风险图形绘制
风险生存曲线
library(survival)
#绘制train组生存曲线
rt=read.table("riskTrain.txt",header=T,sep="\t")
diff=survdiff(Surv(futime, fustat) ~risk,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
pValue=signif(pValue,4)
pValue=format(pValue, scientific = TRUE)
fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
summary(fit) #查看五年生存率
pdf(file="survivalTrain.pdf",width=5.5,height=5)
plot(fit,
lwd=2,
col=c("red","blue"),
xlab="Time (year)",
ylab="Survival rate",
main=paste("Survival curve (p=", pValue ,")",sep=""),
mark.time=T)
legend("topright",
c("high risk", "low risk"),
lwd=2,
col=c("red","blue"))
dev.off()
#绘制test组生存曲线
rt=read.table("riskTest.txt",header=T,sep="\t")
diff=survdiff(Surv(futime, fustat) ~risk,data = rt)
pValue=1-pchisq(diff$chisq,df=1)
pValue=signif(pValue,4)
pValue=format(pValue, scientific = TRUE)
fit <- survfit(Surv(futime, fustat) ~ risk, data = rt)
summary(fit) #查看五年生存率
pdf(file="survivalTest.pdf",width=5.5,height=5)
plot(fit,
lwd=2,
col=c("red","blue"),
xlab="Time (year)",
ylab="Survival rate",
main=paste("Survival curve (p=", pValue ,")",sep=""),
mark.time=T)
legend("topright",
c("high risk", "low risk"),
lwd=2,
col=c("red","blue"))
dev.off()
风险ROC曲线
library(survivalROC)
rt=read.table("riskTrain.txt",header=T,sep="\t",check.names=F,row.names=1)
pdf(file="rocTrain.pdf",width=6,height=6)
par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore,
predict.time =1, method="KM")
plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col='red',
xlab="False positive rate", ylab="True positive rate",
main=paste("ROC curve (", "AUC = ",round(roc$AUC,3),")"),
lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
abline(0,1)
dev.off()
rt=read.table("riskTest.txt",header=T,sep="\t",check.names=F,row.names=1)
pdf(file="rocTest.pdf",width=6,height=6)
par(oma=c(0.5,1,0,1),font.lab=1.5,font.axis=1.5)
roc=survivalROC(Stime=rt$futime, status=rt$fustat, marker = rt$riskScore,
predict.time =1, method="KM")
plot(roc$FP, roc$TP, type="l", xlim=c(0,1), ylim=c(0,1),col='red',
xlab="False positive rate", ylab="True positive rate",
main=paste("ROC curve (", "AUC = ",round(roc$AUC,3),")"),
lwd = 2, cex.main=1.3, cex.lab=1.2, cex.axis=1.2, font=1.2)
abline(0,1)
dev.off()