题目来源:https://mp.weixin.qq.com/s/jzDD05M5AuhCkiavkoLiHg
- 对GSE20685的所有基因做生存分析(表达量中位值分组),获取统计学显著差异的基因列表。
2.1 批量生存分析,获取统计学显著差异的基因列表。
rm(list=ls())
options(stringsAsFactors = F)
options(warn = -1)
library(AnnoProbe)
gset<-geoChina("GSE20685")
eSet <- exprs(gset[[1]])
pheno <- pData(gset[[1]])
dim(eSet)
#[1] 54627 327
dim(pheno)
#[1] 327 63
eSet[1:4,1:4]
# GSM519117 GSM519118 GSM519119 GSM519120
#1007_s_at 11.995510 12.042242 11.903921 11.905713
#1053_at 9.440330 9.329414 8.893978 9.577525
#117_at 7.907728 7.991689 8.163261 9.160542
#121_at 9.007045 8.976325 8.728571 8.665711
##表达矩阵行名是探针名,需要转换为基因名。
checkGPL(gset[[1]]@annotation)
#[1] TRUE
e2g <- idmap(gset[[1]]@annotation)
eSet <- filterEM(eSet,e2g)
eSet[1:4,1:4]
# GSM519117 GSM519118 GSM519119 GSM519120
#ZZZ3 10.562788 10.68748 10.209871 10.48307
#ZZEF1 9.856933 10.46540 9.873843 10.07197
#ZYX 10.672518 10.48420 11.005133 10.59319
#ZYG11B 11.059366 11.29188 11.039351 11.49405
查看临床信息
characteristics_ch1.3
是存活情况characteristics_ch1.4
是存活时间(存疑)
dat <- pheno[,c("characteristics_ch1.3","characteristics_ch1.4")]
library(stringr)
colnames(dat) <- c("status","OS.time")
dat$status <- as.numeric(unlist(lapply(str_split(dat$status,":"),function(x) {return(x[2])})))
dat$OS.time <- as.numeric(unlist(lapply(str_split(dat$OS.time,":"),function(x) {return(x[2])})))
table(dat$status)
# 0 1
#244 83
boxplot(dat$OS.time~dat$status)
死亡病例对应的持续回访时间更低。
library(survival)
cg <- array(dim = nrow(eSet))
survdata <- dat
my.surv <- Surv(survdata$OS.time,survdata$status==0)
#library("survminer")
dim(eSet)
for (i in 1:nrow(eSet)) {
# i <- 1
# print(i)
gene_exprs <- eSet[i,]
gene_exprs <- as.data.frame(t(gene_exprs))
gene_exprs[,1] <- as.numeric(gene_exprs[,1])
gene_exprs$g <-
ifelse(gene_exprs[,1]>=median(gene_exprs[,1]),'high','low')
# print(table(gene_exprs$g))
gene_exprs$sample_name <- rownames(gene_exprs)
gene_surv <- survdata
gene_surv$sample_name <- rownames(gene_surv)
gene_surv <- merge(gene_surv,gene_exprs,by='sample_name')
# kmfit <- survfit(my.surv~gene_surv$g,data = gene_surv)
kmdif <- survdiff(my.surv~gene_surv$g,data = gene_surv)
# ggsurvplot(kmfit,conf.int = F,pval = T)
p.value <- 1 - pchisq(kmdif$chisq, length(kmdif$n) -1)
cg[i] <- (p.value < 0.05)
}
table(cg)
#cg
#FALSE TRUE
#18386 1802
##有1802个基因统计学差异显著。
surv_diff_genes <- rownames(eSet)[cg]
save(surv_diff_genes,file = "surv_diff_genes.rdata")
2.2 对生存分析显著的基因列表做富集分析
1)获取列表基因的ENTREZID
rm(list=ls())
load("surv_diff_genes.rdata")
surv.diff.genes <- as.data.frame(surv_diff_genes)
colnames(surv.diff.genes) <- c("SYMBOL")
library(clusterProfiler)
library(org.Hs.eg.db)
df <- bitr(surv.diff.genes$SYMBOL,
fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db)
genelist <- df$ENTREZID
length(genelist)
#[1] 1762
1)GO分析
go_enrich_results <- lapply( c('BP','MF','CC') , function(ont) {
cat(paste('Now process ',ont ))
ego <- enrichGO(gene = genelist,
OrgDb = org.Hs.eg.db,
ont = ont ,
pAdjustMethod = "BH",
pvalueCutoff = 0.99,
qvalueCutoff = 0.99,
readable = TRUE)
print( head(ego) )
dotplot(ego,title=paste0('dotplot_',ont)) %>% print()
return(ego)
})
save(go_enrich_results,file = 'go_enrich_results.Rdata')
2)KEGG分析
library(clusterProfiler)
kk <- enrichKEGG(gene = genelist,
organism = 'hsa',
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
head(kk)[,1:6]
# ID Description GeneRatio BgRatio pvalue p.adjust
#hsa04145 hsa04145 Phagosome 25/658 152/7978 0.0006218799 0.1940265
#hsa04974 hsa04974 Protein digestion and absorption 16/658 95/7978 0.0044579625 0.6954421
#hsa04657 hsa04657 IL-17 signaling pathway 15/658 94/7978 0.0095703403 0.8056927
#hsa00650 hsa00650 Butanoate metabolism 6/658 28/7978 0.0241405319 0.8056927
#hsa05130 hsa05130 Pathogenic Escherichia coli infection 25/658 202/7978 0.0259336441 0.8056927
#hsa03010 hsa03010 Ribosome 20/658 153/7978 0.0260288000 0.8056927
enrichKK=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
可视化
dotplot(enrichKK,color = "pvalue")
barplot(enrichKK,showCategory=20,color = "pvalue")
cnetplot(enrichKK, categorySize="pvalue", colorEdge = TRUE)
emapplot(enrichKK,color = "pvalue")
heatplot(enrichKK,showCategory = 20)