本文大部分内容来源于网络,写此文章仅是为了自己后面查找方便,如有侵权,联系删除。
参考来自于:
自己构建物种包OrgDb,然后用clusterProfiler富集分析 - 简书 (jianshu.com)
构建物种的OrgDb教程
eggnog 网站信息http://eggnog-mapper.embl.de/, 可以直接提交蛋白序列,如果提交的为多个转录本的蛋白序列,预测结果为各个转录本的注释信息,需要自己将各个转录本的注释信息整合为单个基因的注释信息
1. 构建系统OrgDb数据库
Step1 加载需要的库,若无,则用install.packages()安装
library(clusterProfiler)
library(tidyverse)
library(stringr)
library(KEGGREST)
library(AnnotationForge)
library(clusterProfiler)
library(dplyr)
library(jsonlite)
library(purrr)
library(RCurl)
options(stringsAsFactors = F)
emapper <-read.table("your_eggnog_result",header = TRUE, sep = "\t")
#读入自己的基因组注释信息,直接提交到eggnog网站,如果是区分转录本进行的注释,需要将注释信息整合为基因的注释信息
#该文档主要是四列,包括query Preferred_name GOs KEGG_ko
emapper[emapper==""]<-NA #将缺省值替换为NA
step2 提取GO信息
提取emapper中的query列、Preferred_name列以及GOs列
gene_info <- emapper %>% dplyr::select(GID = query,GENENAME = Preferred_name) %>% na.omit()
gos <- emapper %>% dplyr::select(query, GOs) %>% na.omit()
构建一个空的gene2go数据框,为后面填充数据用
gene2go = data.frame(GID =character(),GO = character(),EVIDENCE = character())
对gene2go空数据框进行填充,循环的作用简单来说就单行变多行
for (row in 1:nrow(gos)) {
the_gid <- gos[row, "query"][[1]]
the_gos <- str_split(gos[row,"GOs"], ",", simplify = FALSE)[[1]]
df_temp <- data_frame(GID=rep(the_gid, length(the_gos)),
GO = the_gos,
EVIDENCE = rep("IEA", length(the_gos)))
gene2go <- rbind(gene2go, df_temp)}
将gene2go中GOs列中的“-”替换为NA,然后使用na.omit()删除含有NA的行
gene2go$GO[gene2go$GO=="-"]<-NA
gene2go<-na.omit(gene2go)
step3 提取KEGG信息
#把Emapper中的query列、KEGG_ko列提出出来
gene2ko <- emapper %>% dplyr::select(GID =query, Ko = KEGG_ko) %>% na.omit()
#将gene2ko的Ko列中的“Ko:”删除,不然后面找pathway会报错
gene2ko$Ko <- gsub("ko:","",gene2ko$Ko)
#下载KO的json文件并【放到当前文件夹下】,下载地址:https://www.genome.jp/kegg-bin/get_htext?ko00001
update_kegg <- function(json = "ko00001.json") {
pathway2name <- tibble(Pathway = character(), Name = character())
ko2pathway <- tibble(Ko = character(), Pathway = character())
kegg <- fromJSON(json)
for (a in seq_along(kegg[["children"]][["children"]])) {
A <- kegg[["children"]][["name"]][[a]]
for (b in seq_along(kegg[["children"]][["children"]][[a]][["children"]])) {
B <- kegg[["children"]][["children"]][[a]][["name"]][[b]]
for (c in seq_along(kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]])) {
pathway_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["name"]][[c]]
pathway_id <- str_match(pathway_info, "ko[0-9]{5}")[1]
pathway_name <- str_replace(pathway_info, " \\[PATH:ko[0-9]{5}\\]", "") %>% str_replace("[0-9]{5} ", "")
pathway2name <- rbind(pathway2name, tibble(Pathway = pathway_id, Name = pathway_name))
kos_info <- kegg[["children"]][["children"]][[a]][["children"]][[b]][["children"]][[c]][["name"]]
kos <- str_match(kos_info, "K[0-9]*")[,1]
ko2pathway <- rbind(ko2pathway, tibble(Ko = kos, Pathway = rep(pathway_id, length(kos))))}}}
save(pathway2name, ko2pathway, file = "kegg_info.RData")}
调用函数后在本地创建kegg_info.RData文件
update_kegg()
载入kegg_info.RData文件
load(file = "kegg_info.RData")
根据 gene2ko中的ko信息将ko2pathway中的pathway列提取出来
gene2pathway <- gene2ko %>% left_join(ko2pathway,by = "Ko") %>% dplyr::select(GID, Pathway) %>% na.omit()
在网站https://www.ncbi.nlm.nih.gov/taxonomy上查询物种信息
tax_id="4577"
genus="zea"
species="mays"
去除重复
gene2go <- unique(gene2go)
gene2go <- gene2go[!duplicated(gene2go),]
gene2ko <- gene2ko[!duplicated(gene2ko),]
gene2pathway <- gene2pathway[!duplicated(gene2pathway),]
构建OrgDb数据库
makeOrgPackage(gene_info=gene_info,
go=gene2go,
ko=gene2ko,
pathway=gene2pathway,
version="0.0.1", #版本
maintainer = "landuomayi<********@gmail.com>", #修改为你的名字和邮箱
author = "landuomayi<********@gmail.com>", #修改为你的名字和邮箱
outputDir = ".", #输出文件位置
tax_id=tax_id,
genus=genus,
species=species,
goTable="go")
2 进行culsterprofiler GO、KEGG分析
2.1 导入自己构建的 OrgDb
install.packages("org.Zmays.eg.db", repos=NULL, type="sources")
library(org.Zmays.eg.db)
columns(org.Zmays.eg.db)
2.2 读入自己的差异表达基因列表
gene_list <- rownames(DEGs) #读入自己的差异表达基因列表
2.3 KEGG分析
从 OrgDB 提取 Pathway 和基因的对应关系
pathway2gene <- AnnotationDbi::select(org.Zmays.eg.db,
keys = keys(org.Zmays.eg.db),
columns = c("Pathway","Ko")) %>%
na.omit() %>%
dplyr::select(Pathway, GID)
导入 Pathway 与名称对应关系
load("kegg_info.RData")
KEGG pathway 富集
kegg <- enricher(gene_list,
TERM2GENE = pathway2gene,
TERM2NAME = pathway2name,
pvalueCutoff = 1,
qvalueCutoff = 1,
pAdjustMethod = "BH",
minGSSize = 1)
kegg_results <- as.data.frame(kegg)
kegg_bar <-barplot(kegg_results, showCategory=20,color="pvalue", font.size=10) #绘制柱状图
ggsave("kegg_bar.png",kegg_bar, width = 16, height = 9, units = "in", dpi = 600)# 保存图片
kegg_dot <-dotplot(kegg_results, showCategory=20,color="pvalue", font.size=10) #绘制点状图
ggsave("kegg_dot.png",kegg_dot, width = 16, height = 9, units = "in", dpi = 600)# 保存图片
2.4 GO分析
分子功能(Molecular Function)
erich.go.MF <- enrichGO(gene=rownames(gene_deseq2),OrgDb = org.Zmays.eg.db,pvalueCutoff = 0.01,qvalueCutoff = 0.01,ont = "MF",keyType = "GID")
g_MF<-barplot(erich.go.MF,showCategory = 15)
ggsave("GO_MF.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)
生物过程(Biological Process)
erich.go.BP <- enrichGO(gene=rownames(gene_deseq2),OrgDb = org.Zmays.eg.db,pvalueCutoff = 0.01,qvalueCutoff = 0.01,ont = "BP",keyType = "GID")
g_BP<-barplot(erich.go.BP,showCategory = 15)
ggsave("GO_BP.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)
细胞组成(Cellular Component)
erich.go.CC <- enrichGO(gene=rownames(gene_deseq2),OrgDb = org.Zmays.eg.db,pvalueCutoff = 0.01,qvalueCutoff = 0.01,ont = "CC",keyType = "GID")
g_CC<-barplot(erich.go.CC,showCategory = 15)
ggsave("GO_CC.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)