GSEA:(Gene Set Enrichment Analysis)基因集富集分析
基因集富集分析(GSEA)和基因富集分析(GO/KEGG)的区别:
基因富集分析是对已注释基因在GO数据库或KEGG数据库中找各基因参与的通路,可以知道哪些基因参与了哪些通路,但这些基因是否上调/下调参与是不能知道的。即使在画图的时候分别富集上调和下调的基因参与通路,也有可能出现同一个通路中既有上调的基因,也有下调的基因,如下图。所以GO/KEGG分析是没有考虑基因表达量的,只考虑有无参与某通路,换一句话说,该通路是否被抑制或被激活是无从得知的。(最后附上这个图是怎么画的)
GSEA分析是一种基于基因集的富集分析方法,基于基因表达数据的大小进行排序。然后判断每个基因集内的基因是否富集于表型相关度排序后基因列表的上部或下部,从而判断此基因集内基因的协同变化对表型变化的影响。(看不太懂,没关系,通过代码你能更好的理解这个分析的原理)
附上一些参考资料:
- 大致理解基因集富集分析:https://zhuanlan.zhihu.com/p/259894818
- MSigDB的人类database:https://docs.gsea-msigdb.org/#MSigDB/Release_Notes/MSigDB_2023.2.Hs/
- msigdbr 包提供多个物种的MSigDB数据:https://www.jianshu.com/p/f2febb3123d8
- 基因矩阵转置文件格式(* .gmt):https://mp.weixin.qq.com/s?__biz=MzI4ODE0NTE3OA==&mid=2649200588&idx=1&sn=3a6e00669fd853e821f6748c642c464f&chksm=f3d1ddb9c4a654af01da73ed2befb8980379a4aa4fcc910775a7b775dda2435d1e1b06c6a323&scene=21%22%20%5Cl%20%22wechat_redirect%22%20%5Ct%20%22_blank
GSEA分析代码实现
前期准备 - edgeR 获取foldchange
library(edgeR)
library(statmod)
# 设置工作路径,导入文件时直接默认处于此目录,不需要每次输绝对路径
setwd("D:\\data")
# 导入样本表达量丰度文件(我这里导入是的reads count。类似第一列是geneID,后续几列是各样本的reads count)
samples_abundance <- read.delim("D:/data/samples_abundance.esp")
# 设置分组文件
group_file <- data.frame(group=c("control","control","control","control","patient","patient","patient","patient"),
sample=c("DHH_1","ZSQ_1","ZZX_1","YYF_1","LXD_4","SXC_4","MYF_4","WFZ_4"))
# 设置保存输出文件前缀
outname = "all_gene_edgeRdiff"
# 获取reads count所在列
count_data <- samples_abundance[,c(4:11)]
# 列名设置为gene_ID
rownames(count_data) <- samples_abundance$gene_ID
# edgeR分析(可以直接照搬)
dgelist <- DGEList(counts = count_data, group = group_file$group)
keep <- rowSums(cpm(dgelist) > 1 ) >= 2 # 过滤一些低表达的基因
dgelist <- dgelist[keep,keep.lib.sizes = FALSE]
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')
design <- model.matrix(~group_file$group)
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)
fit <- glmFit(dge, design, robust = TRUE)
lrt <- topTags(glmLRT(fit), n = nrow(dgelist$counts))
mode='glmQLFTest'
#排序并标出UP/down
lrt<-lrt$table
gene_diff <- lrt[order(lrt$FDR, lrt$logFC, decreasing = c(FALSE, TRUE)), ]
gene_diff[which(gene_diff$logFC >= 1 & gene_diff$PValue < 0.05),'sig'] <- 'up'
gene_diff[which(gene_diff$logFC <= -1 & gene_diff$PValue < 0.05),'sig'] <- 'down'
gene_diff[which(abs(gene_diff$logFC) <= 1 | gene_diff$PValue >= 0.05),'sig'] <- 'non-significant'
gene_diff$gene_ID<- rownames(gene_diff)
#输出差异基因总表
write.table(gene_diff, paste(outname,'.txt',sep=""), sep = '\t', quote = FALSE,row.names = FALSE)
#输出显著性差异基因总表
gene_diff_select <- subset(gene_diff, sig %in% c('up', 'down'))
write.table(gene_diff_select, file = paste(outname,'.select.txt',sep=""), sep = '\t',row.names = FALSE, quote = FALSE)
## 最后你想用差异基因总表或显著性差异基因总表都可以
得到的文件如下图:
正式开始GSEA分析
library(fgsea)
library(tidyverse)
library(org.Hs.eg.db)
library(msigdbr)
setwd("D:\\data")
# 刚刚edgeR得到的显著性差异基因总表
all_gene_edgeRdiff.select <- read.delim("D:/data/all_gene_edgeRdiff.select.txt")
##### 获取entrez id
# entrez id是和ENSEMBL的ID一一对应的,但是ENSEMBL只有“ENSG00000196735”,没有后面的“.13”,如果有“.13”是无法找到的,要先处理一下数据
tx_gene_id<-separate(all_gene_edgeRdiff.select,col = transcript_ID,into = c("gene_ID","a"),sep = "[.]")
DEG.gene_symbol = as.character(tx_gene_id$gene_ID)
# 我这里是从人类db找的ENTREZID
entrez_id<-bitr(geneID = DEG.gene_symbol,fromType = "ENSEMBL",toType = "ENTREZID",OrgDb = org.Hs.eg.db)
colnames(entrez_id)<-c("gene_ID","ENTREZID")
# 将ENTREZID和foldchange合并为一个文件
entrez_gene<-merge(entrez_id,tx_gene_id_il,by="gene_ID")
###### rank文件准备
# fgsea需要接收foldchange,但它不能为list类型,只能为向量。且每个foldchange的行名为ENTREZID,然后从大到小排序
rank_df<-entrez_gene$logFC
names(rank_df)<-entrez_gene$ENTREZID
rank_df<-sort(rank_df,decreasing = TRUE)
###### pathway文件准备
# fgsea需要接收pathway文件,格式为list,列名为各通路,元素为参与该通路的所有ENTREZID,以character格式存在,这些通路和ENTREZID可以从MSigDB找到。
msigdb.hs = msigdbr(species = "Homo sapiens",category = "C5",subcategory = "BP")
# 我这里选择的是人类-C5类(GO库)-BP通路。要知道还有什么类,可以看上面参考资料的2和3
# 这个数据库的格式还不能直接用,需要调整为最后的pathway形式
msigdb.hs_index<-msigdb.hs[!duplicated(msigdb.hs[,3]),]
pathway<-list()
for(i in 1:nrow(msigdb.hs_index)){
pathway_name<-c(msigdb.hs_index[i,3])
path_list<-subset(msigdb.hs,msigdb.hs$gs_name==pathway_name)
e_id_list<-as.character(path_list$entrez_gene)
pathway[[i]]<-e_id_list
}
names(pathway)<-msigdb.hs_index$gs_name
# 后续其他数据分析可以直接导入使用,不需要再跑一次
sink("GO_BP_pathway.txt")
pathway
sink()
# GESA分析
fgseaRes <- fgsea(pathways = pathway,
stats = rank_df,
minSize=15,
maxSize=500,
nperm=100000)
head(fgseaRes[order(pval), ])
df_fgseaRes = data.frame(lapply(fgseaRes, as.character), stringsAsFactors=FALSE)
write.table(df_fgseaRes,file = "fgseaRes.txt",sep="\t",quote = FALSE,row.names = FALSE)
可视化
# plot the most significantly enriched pathway
plotEnrichment(pathway[[head(fgseaRes[order(pval), ], 1)$pathway]],
rank_df) + labs(title=head(fgseaRes[order(pval), ], 1)$pathway)
该图说明:显著富集的通路为生物之间的种间相互作用,并且参与基因大部分是上调的,说明该通路大概率被激活
# 上调/下调前15个pathway
topPathwaysUp <- fgseaRes[ES > 0][head(order(pval), n=15), pathway]
topPathwaysDown <- fgseaRes[ES < 0][head(order(pval), n=15), pathway]
topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
plotGseaTable(pathway[topPathways], rank_df, fgseaRes,
gseaParam = 0.5)
该图说明:显著富集上调的通路有生物之间的种间相互作用、响应其他物种、细菌、脂质、防御响应等等;显著富集下调的通路有产生免疫球蛋白、细胞附着调控、免疫调控响应等等。
现在再看“GSEA分析是一种基于基因集的富集分析方法,基于基因表达数据的大小进行排序。然后判断每个基因集内的基因是否富集于表型相关度排序后基因列表的上部或下部,从而判断此基因集内基因的协同变化对表型变化的影响。”这段话,有没有好理解一些?
GO分析可视化
setwd("D:\\data\\edgeR_GO")
library(clusterProfiler)
library(topGO)
library(Rgraphviz)
library(pathview)
library(org.Hs.eg.db)
library(readxl)
library(dplyr)
library(ggplot2)
library(stringr)
library(AnnoProbe)
library(tinyarray)
library(tidyverse)
## 跑完edgeR得到上调/下调的基因,分别做两个表,仅一列,第一行为表头“gene_ID”,后续为上调/下调的基因名
up_gene_ID <- read.csv("D:/data/gland_blood/total/edgeR_GO/up_gene_ID.txt", sep="")
down_gene_ID <- read.csv("D:/data/gland_blood/total/edgeR_GO/down_gene_ID.txt", sep="")
# up_gene做GO分析
UP_gene_ID<-separate(up_gene_ID,col = gene_ID,into = c("gene_id","a"),sep = '[.]')
up_DEG.gene_symbol = as.character(UP_gene_ID$gene_id)
up_DEG.gene_symbol = na.omit(up_DEG.gene_symbol)
up_DEG.gene_symbol = unique(up_DEG.gene_symbol)
up_DEG.entrez_id = mapIds(x = org.Hs.eg.db,
keys = up_DEG.gene_symbol,
keytype = "ENSEMBL",
column = "ENSEMBL","SYMBOL","GENENAME")
# GO分析
up_erich.go.BP = enrichGO(gene = up_DEG.entrez_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
readable=T)
up_BP <- as.data.frame(up_erich.go.BP)
up_erich.go.CC = enrichGO(gene = up_DEG.entrez_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "CC",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
readable=T)
up_CC <- as.data.frame(up_erich.go.CC)
up_erich.go.MF = enrichGO(gene = up_DEG.entrez_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "MF",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
readable=T)
up_MF <- as.data.frame(up_erich.go.MF)
# down_gene做GO分析
DOWN_gene_ID<-separate(down_gene_ID,col = gene_ID,into = c("gene_id","a"),sep = '[.]')
down_DEG.gene_symbol = as.character(DOWN_gene_ID$gene_id)
down_DEG.gene_symbol = na.omit(down_DEG.gene_symbol)
down_DEG.entrez_id = mapIds(x = org.Hs.eg.db,
keys = down_DEG.gene_symbol,
keytype = "ENSEMBL",
column = "ENSEMBL","SYMBOL","GENENAME")
# GO分析
down_erich.go.BP = enrichGO(gene = down_DEG.entrez_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
readable=T)
down_BP <- as.data.frame(down_erich.go.BP)
down_erich.go.CC = enrichGO(gene = down_DEG.entrez_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "CC",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
readable=T)
down_CC <- as.data.frame(down_erich.go.CC)
down_erich.go.MF = enrichGO(gene = down_DEG.entrez_id,
OrgDb = org.Hs.eg.db,
keyType = "ENSEMBL",
ont = "MF",
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH",
readable=T)
down_MF <- as.data.frame(down_erich.go.MF)
# 合并数据
down_BP$sig<-c("down")
down_CC$sig<-c("down")
down_MF$sig<-c("down")
up_BP$sig<-c("up")
up_CC$sig<-c("up")
up_MF$sig<-c("up")
# 根据每个通路中有多少gene参与进行排序,选择前22个通路
down_BP_22<-arrange(down_BP,-Count)
down_CC_22<-arrange(down_CC,-Count)
down_MF_22<-arrange(down_MF,-Count)
up_BP_22<-arrange(up_BP,-Count)
up_CC_22<-arrange(up_CC,-Count)
up_MF_22<-arrange(up_MF,-Count)
data<-rbind(down_BP_22[1:20,],down_CC[1:20,],down_MF_22[1:20,],up_BP_22[1:20,],up_CC_22[1:20,],up_MF_22[1:20,])
data_df <- data %>%
mutate(p2 = ifelse(sig == "up", Count, -Count)) %>%
arrange(sig,Count)
data_df$ID = factor(data_df$ID,levels = unique(data_df$ID),ordered = T)
data_df<-arrange(data_df,desc(sig),-Count)
# 画图
tmp = with(data_df, labeling::extended(range(p2)[1], range(p2)[2], m = 5));tmp
lm = tmp[c(1,length(tmp))];lm
x_lab<-data_df$Description
ggplot(data_df, aes(x=Description, y=p2)) +
geom_segment( aes(x=ID,xend=ID,y=0, yend=p2, color=sig), size=5, alpha=0.9) +
theme_light() +
theme(
panel.border = element_blank(),
) +
xlab("") +
ylab("num. of gene")+ylim(lm)+
scale_y_continuous(breaks = tmp,
labels = abs(tmp))+
coord_flip()+ # 反转x/y轴
scale_x_discrete(labels=rev(x_lab)) # 使通路名为x轴(反转后为Y轴)这里你需要认真对一下,有时候顺序是不对的,可以自行调整x_lab顺序