自备的GO库,使用R clusterProfiler做GO富集
非模式物种和数据库中没有的物种要做GO分析怎么办呢?现自己准备一个文件作为数据库,这个文件怎么准备,见非模式物种如何准备富集数据库(一)。
GO富集结果进行图形绘制
library(clusterProfiler)
#事先准备好的
go <- read.delim(go,header=TRUE,sep='\t')
给大家展示下准备的文件内容
> head(go)
gene_id go_id go_ontology go_term
1 ENSCHIG00000000039 GO:0006810 BP transport
2 ENSCHIG00000000039 GO:0055085 BP transmembrane transport
3 ENSCHIG00000000039 GO:0008150 BP biological_process
4 ENSCHIG00000000039 GO:0051179 BP localization
5 ENSCHIG00000000039 GO:0051234 BP establishment of localization
6 ENSCHIG00000000039 GO:0016021 CC integral component of membrane
将准备的文件按照BP、CC、MF三部分生成对应的表格
term2gene_bp <- go[go[,'go_ontology']=='BP',c('go_id','gene_id')] #将属于BP的内容提出,第一列为go_id 第二列为gene_id
term2name_bp <- go[go[,'go_ontology']=='BP',c('go_id','go_term')] #与上一样,第一列为go_id 第二列为go_term
term2gene_cc <- go[go[,'go_ontology']=='CC',c('go_id','gene_id')]
term2name_cc <- go[go[,'go_ontology']=='CC',c('go_id','go_term')]
term2gene_mf <- go[go[,'go_ontology']=='MF',c('go_id','gene_id')]
term2name_mf <- go[go[,'go_ontology']=='MF',c('go_id','go_term')]
生成的对应内容我们查看一下bp部分
> head(term2gene_bp)
go_id gene_id
1 GO:0006810 ENSCHIG00000000039
2 GO:0055085 ENSCHIG00000000039
3 GO:0008150 ENSCHIG00000000039
4 GO:0051179 ENSCHIG00000000039
5 GO:0051234 ENSCHIG00000000039
46 GO:0008152 ENSCHIG00000000041
> head(term2name_bp)
go_id go_term
1 GO:0006810 transport
2 GO:0055085 transmembrane transport
3 GO:0008150 biological_process
4 GO:0051179 localization
5 GO:0051234 establishment of localization
46 GO:0008152 metabolic process
#读入需要做富集的基因列表,此文件有两列
clus<-read.delim(clustergene,header=T,sep='\t')
clustergene <- as.character(clus[,1]) #第一列为gene_id
genename_vt <- as.character(clus$gene_name) #第二列为gene_name
查看下需要进行富集的基因id
> head(clustergene)
[1] "ENSCHIG00000013575" "ENSCHIG00000005295" "ENSCHIG00000022242"
[4] "ENSCHIG00000023002" "ENSCHIG00000025211" "ENSCHIG00000019483"
将gene id作为gene name的名称,
names(genename_vt)<-clustergene
查看下生成的内容,这个向量用于后边生成GO富集结果中gene id替换为gene name使用
> head(genename_vt)
ENSCHIG00000013575 ENSCHIG00000005295 ENSCHIG00000022242
"MFGE8" "ENSCHIG00000005295" "CD109"
ENSCHIG00000023002 ENSCHIG00000025211 ENSCHIG00000019483
"REV3L" "KITLG" "TXNDC11"
backgene <- as.character(read.delim(diffresult,header=TRUE,sep=',')[,1]) #读入背景基因列表,取出背景基因的id
#进行富集
BPenrich <- enricher(gene=clustergene,universe=backgene,TERM2GENE=term2gene_bp,TERM2NAME=term2name_bp,pAdjustMethod='BH',pvalueCutoff=1,qvalueCutoff=1)
CCenrich <- enricher(gene=clustergene,universe=backgene,TERM2GENE=term2gene_cc,TERM2NAME=term2name_cc,pAdjustMethod='BH',pvalueCutoff=1,qvalueCutoff=1)
MFenrich <- enricher(gene=clustergene,universe=backgene,TERM2GENE=term2gene_mf,TERM2NAME=term2name_mf,pAdjustMethod='BH',pvalueCutoff=1,qvalueCutoff=1)
查看下富集结果
> head(BPenrich)
ID Description
GO:0071702 GO:0071702 organic substance transport
GO:0006511 GO:0006511 ubiquitin-dependent protein catabolic process
GO:0019941 GO:0019941 modification-dependent protein catabolic process
GO:0043632 GO:0043632 modification-dependent macromolecule catabolic process
GO:0045454 GO:0045454 cell redox homeostasis
GO:0033036 GO:0033036 macromolecule localization
GeneRatio BgRatio pvalue p.adjust qvalue
GO:0071702 10/137 147/4241 0.01955833 0.4882532 0.4817169
GO:0006511 4/137 33/4241 0.02062225 0.4882532 0.4817169
GO:0019941 4/137 33/4241 0.02062225 0.4882532 0.4817169
GO:0043632 4/137 33/4241 0.02062225 0.4882532 0.4817169
GO:0045454 4/137 35/4241 0.02511630 0.4882532 0.4817169
GO:0033036 9/137 133/4241 0.02702033 0.4882532 0.4817169
geneID
GO:0071702 ENSCHIG00000020545/ENSCHIG00000021348/ENSCHIG00000018170/ENSCHIG00000014240/ENSCHIG00000010014/ENSCHIG00000012338/ENSCHIG00000022004/ENSCHIG00000025765/ENSCHIG00000016192/ENSCHIG00000007220
GO:0006511 ENSCHIG00000019061/ENSCHIG00000018162/ENSCHIG00000022567/ENSCHIG00000015198
GO:0019941 ENSCHIG00000019061/ENSCHIG00000018162/ENSCHIG00000022567/ENSCHIG00000015198
GO:0043632 ENSCHIG00000019061/ENSCHIG00000018162/ENSCHIG00000022567/ENSCHIG00000015198
GO:0045454 ENSCHIG00000019483/ENSCHIG00000022095/ENSCHIG00000010625/ENSCHIG00000021009
GO:0033036 ENSCHIG00000020545/ENSCHIG00000021348/ENSCHIG00000018170/ENSCHIG00000014240/ENSCHIG00000010014/ENSCHIG00000012338/ENSCHIG00000022004/ENSCHIG00000025765/ENSCHIG00000016192
Count
GO:0071702 10
GO:0006511 4
GO:0019941 4
GO:0043632 4
GO:0045454 4
GO:0033036 9
对三部分富集的内容结果进行格式调整,将第六列名字由 p.adjust改为padj,去掉qvalue列,并为每部分增加标签BP、MF、CC三类,最后将三部分进行合并为GOenrich,这样调整是因为我需要对这结果进行下游分析需要的格式,大家可以不需要进行调整,但是要把这三部分进行一个标注,以便区分。
BPenrich <- as.data.frame(BPenrich)
names(BPenrich)[6] <- 'padj'
BPenrich <- subset(BPenrich,select=-qvalue)
Category <- rep('BP',time=nrow(BPenrich))
BPenrich <- cbind(Category,BPenrich)
CCenrich <- as.data.frame(CCenrich)
names(CCenrich)[6] <- 'padj'
CCenrich <- subset(CCenrich,select=-qvalue)
Category <- rep('CC',time=nrow(CCenrich))
CCenrich <- cbind(Category,CCenrich)
MFenrich <- as.data.frame(MFenrich)
names(MFenrich)[6] <- 'padj'
MFenrich <- subset(MFenrich,select=-qvalue)
Category <- rep('MF',time=nrow(MFenrich))
MFenrich <- cbind(Category,MFenrich)
GOenrich <- rbind(BPenrich,CCenrich,MFenrich)
将geneID列中对gene_id 改为对应对gene_name
gene_list <- strsplit(GOenrich$geneID,split='/')
geneName <- unlist(lapply(gene_list,FUN=function(x){paste(genename_vt[x],collapse='/')}))
GOenrich <- data.frame(GOenrich[,1:8],geneName,GOenrich[,9,drop=F])
#选出显著部分内容,以padj<0.05为准
Significant <- subset(GOenrich,padj<0.05)
#保存富集结果用于后续绘图
write.table(GOenrich,file=paste('sample','_GOenrich.txt',sep=''),sep='\t',quote=F,row.names=F)
write.table(Significant,file=paste('sample','_GOenrich_significant.txt',sep=''),sep='\t',quote=F,row.names=F)
enricher和 enrichGO的区别
对于有数据库对非模式物种使用enrichGO
对于没有数据库,自己构建对数据使用enricher
enricher package:clusterProfiler R Documentation
enricher
Description:
A universal enrichment analyzer
Usage:
enricher(gene, pvalueCutoff = 0.05, pAdjustMethod = "BH", universe,
minGSSize = 10, maxGSSize = 500, qvalueCutoff = 0.2, TERM2GENE,
TERM2NAME = NA)
Arguments:
gene: a vector of gene id
pvalueCutoff: pvalue cutoff
pAdjustMethod: one of "holm", "hochberg", "hommel", "bonferroni", "BH",
"BY", "fdr", "none"
universe: background genes
minGSSize: minimal size of genes annotated for testing
maxGSSize: maximal size of genes annotated for testing
qvalueCutoff: qvalue cutoff
TERM2GENE: user input annotation of TERM TO GENE mapping, a data.frame
of 2 column with term and gene
TERM2NAME: user input of TERM TO NAME mapping, a data.frame of 2 column
with term and name
enrichGO package:clusterProfiler R Documentation
GO Enrichment Analysis of a gene set. Given a vector of genes, this
function will return the enrichment GO categories after FDR control.
Description:
GO Enrichment Analysis of a gene set. Given a vector of genes,
this function will return the enrichment GO categories after FDR
control.
Usage:
enrichGO(gene, OrgDb, keytype = "ENTREZID", ont = "MF",
pvalueCutoff = 0.05, pAdjustMethod = "BH", universe, qvalueCutoff = 0.2,
minGSSize = 10, maxGSSize = 500, readable = FALSE)
Arguments:
gene: a vector of entrez gene id.
OrgDb: OrgDb
keytype: keytype of input gene
ont: One of "MF", "BP", and "CC" subontologies.
pvalueCutoff: Cutoff value of pvalue.
pAdjustMethod: one of "holm", "hochberg", "hommel", "bonferroni", "BH",
"BY", "fdr", "none"
universe: background genes
qvalueCutoff: qvalue cutoff
minGSSize: minimal size of genes annotated by Ontology term for
testing.
maxGSSize: maximal size of genes annotated for testing
readable: whether mapping gene ID to gene Name