对于没有Org.Db的非模式生物,由于某些原因不制作Org.Db,然后利用clusterProfiler做富集
数据格式
## Annotation_GO.xlsx
Gene.ID Biological.Pathway BP.Description Molecular.Function MF.Description Cellular.Component CC.Description
gene_1 GO:0007283 spermatogenesis GO:0003677 DNA binding GO:0000786//GO:0005634 nucleosome//nucleus
## Annotation_KO.xlsx
Gene.ID 1st-level(pathway) 2nd-level(pathway) 3rd-level(pathway) Pathway.id KO.id KO.Description EC
gene_1 Cellular Processes Cell growth and death Cell cycle ko04110 K02180 cell cycle arrest protein BUB3 -
## test_list.xlsx
gene_id FC
gene_1 1.85
数据准备
#coding:utf-8
library(openxlsx)
library(tidyr)
library(stringr)
library(dplyr)
getwd()
dir()
rm(list=ls())
#定义GO数据结构化函数
data_clean <-function(x,ont){
colnames(x) <- c("gene","GO_id","GO_name")
gterms <- x %>%
dplyr::select(gene, GO_id,GO_name) %>% na.omit()
gene2go <- data.frame(gene = character(),
GO_id = character(),
GO_name = character(),
Ont = character()
)
for (row in 1:nrow(gterms)){
gene_terms_id <- str_split(gterms[row,"GO_id"], "//", simplify = FALSE)[[1]]
gene_terms_name <- str_split(gterms[row,"GO_name"], "//", simplify = FALSE)[[1]]
gene_id <- gterms[row, "gene"][[1]]
tmp <- data.frame(gene = rep(gene_id, length(gene_terms_id)),
GO_id = gene_terms_id,
GO_name = gene_terms_name,
Ont = rep(ont, length(gene_terms_id))
)
gene2go <- rbind(gene2go, tmp)
}
return(gene2go)
}
# 载入数据
go_file <- read.xlsx("Annotation_GO.xlsx")
go_file[go_file=="--"]<-NA
go_bp <- go_file[,1:3]
# 按Ont类型格式化数据
## BP
go_bp <- go_file[,1:3]
gene2go_bp <- data_clean(go_bp,"BP")
go2gene_bp <- distinct(gene2go_bp[,c(2,1)])
go2name_bp <- distinct(gene2go_bp[,c(2,3)])
## MF
go_mf <- go_file[,c(1,4,5)]
gene2go_mf <- data_clean(go_mf,"MF")
go2gene_mf <- distinct(gene2go_mf[,c(2,1)])
go2name_mf <- distinct(gene2go_mf[,c(2,3)])
## CC
go_cc <- go_file[,c(1,6,7)]
gene2go_cc <- data_clean(go_cc,"CC")
go2gene_cc <- distinct(gene2go_cc[,c(2,1)])
go2name_cc <- distinct(gene2go_cc[,c(2,3)])
#保存关键数据
save(go2gene_bp,go2name_bp,go2gene_mf,go2name_mf,go2gene_cc,go2name_cc,file="XX.go.RData")
#富集时调用
#load(file = "XX.go.RData")
#Pathway数据格式化
rm(list=ls())
ko_file <- read.xlsx("Annotation_KO.xlsx")
ko_file[ko_file=="--"]<-NA
gene2ko <- ko_file[,c(1,5,4,3,2)]
colnames(gene2ko) <- c("gene","pathway_id","pathway_name_lv3","pathway_name_lv2","pathway_name_lv1")
ko2gene <- distinct(gene2ko[,c(2,1)])
ko2name <- distinct(gene2ko[,c(2,3)])
#保存关键数据
save(gene2ko,ko2gene,ko2name,file="XX.ko.RData")
#富集时调用
#load(file = "XX.ko.RData")
富集分析绘图
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("clusterProfiler"))
library(clusterProfiler)
library(openxlsx)
## __init__
rm(list = ls())
load(file = "XX.go.RData")
load(file = "AX.ko.RData")
## 导入数据
test_list <- read.xlsx("test_list.xlsx")
## 准备数据格式
test_gene <- test_list[,1]
fc_list <- test_list[,2]
names(fc_list)<- as.character(test_list[,1])
fc_list <- sort(fc_list,decreasing = T)
## GO/KO富集
enrich_go <- enricher(
test_gene, # 前景基因集
pvalueCutoff = 0.05, # p值阈值
pAdjustMethod = "fdr", # 调整p值校正方式。(one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none")
qvalueCutoff = 0.05, # q值阈值
TERM2GENE = go2gene_bp, # 修改以确定富集的Ont类型(go2gene_bp, go2gene_mf, go2gene_cc,ko2gene)
TERM2NAME = go2name_bp # 跟随上一行一起修改 (go2name_bp, go2name_mf, go2name_cc, ko2name)
#universe, # 背景基因集
#minGSSize = 10,
#maxGSSize = 500,
)
## 保存富集结果
write.csv(enrich_go@result,file="XXX.go.csv")
## 绘制条形图
plot_data <- enrich_go@result[1:10,c(2,9,5)]
library(ggplot2)
pp1=ggplot(plot_data, aes(x=reorder(plot_data$Description,plot_data$Count ),y=Count,fill=-1*log10(plot_data$pvalue)))
pbar=pp1+geom_bar(stat="identity")+theme_minimal()+coord_flip()
output_bar_pdf <- paste("XXX",".bar",".pdf",sep = '')
ggsave(pbar,file=output_bar_pdf,width=10,height=6)
## 绘制气泡图
library(ggplot2)
pp= ggplot(plot_data, aes(x=plot_data$pvalue,y=reorder(plot_data$Description,plot_data$pvalue)))
pbubble = pp + geom_point(aes(size=plot_data$Count,color=-1*log10(plot_data$pvalue))) + scale_colour_gradient(low="green",high = "red")+ theme_minimal()
output_bb_pdf <- paste("XXX",".bb",".pdf",sep = '')
ggsave(pbubble,file=output_bb_pdf,width=10,height=6)
## GSEA富集分析
gse_result <- GSEA(
fc_list,
exponent = 1,
eps = 1e-10,
pvalueCutoff = 1,
pAdjustMethod = "fdr",
TERM2GENE = go2gene_bp, # 修改以确定富集的Ont类型(go2gene_bp, go2gene_mf, go2gene_cc,ko2gene)
TERM2NAME = go2name_bp, # 跟随上一行一起修改 (go2name_bp, go2name_mf, go2name_cc, ko2name)
verbose = TRUE,
by = "fgsea"
#minGSSize = 10,
#maxGSSize = 500,
)
## 保存结果
write.csv(gse_result@result,file = "XXX.csv")
## 批量绘制GSEA图
for(i in 1:length(gse_result@result$ID)) {
id <- gse_result@result$ID[i]
gsefig<- gseaplot(gse_result,geneSetID = id)
output_gse_pdf <- paste("XXX",i,".",id,".gse",".pdf",sep = '')
ggsave(gsefig,file=output_gse_pdf,width=10,height=6)
}