1、富集分析的背景知识 - 简书 (jianshu.com)
2、clusterprofile富集分析--上游分析 - 简书 (jianshu.com)
3、clusterprofile富集分析--下游可视化 - 简书 (jianshu.com)
4、clusterprofile富集分析--多组设计的富集及可视化 - 简书 (jianshu.com)
附:[R]基因ID转换:两个R包 - 简书 (jianshu.com)
clusterprofile包目前已经更新到版本4,主要提供基于ORA/GSEA算法的富集分析函数。关于富集的背景基因集,除了常见的GO、KEGG、WikiPathways富集分析,还可以是自定义的基因集。除此以外,系列包
ReactomePA
,DOSE
,meshes
可以方便得进行对Reactome、Disease ontology、Medical Subject Headings三类基因集进行ORA/GSEA富集分析
0、示例差异基因数据
- 差异基因列表:元素为差异倍数、name为ENTREZID基因ID、降序排序
data(geneList, package="DOSE")
str(geneList)
# Named num [1:12495] 4.57 4.51 4.42 4.14 3.88 ...
# - attr(*, "names")= chr [1:12495] "4312" "8318" "10874" "55143" ...
- 显著差异基因:ENTREZID基因ID为元素的字符串
gene <- names(geneList)[abs(geneList) > 2]
str(gene)
# chr [1:207] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" "9493" "1062" "3868" ...
1、指定基因集富集分析
1.1 GO
ORA:enrichGO {clusterProfiler}
相关重要参数有
ont
: One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
keyType
:差异基因的基因ID类型,默认为ENTREZID。因此如下示例没有单独设置;但是只有这里的GO富集分析的差异基因可以是其它类基因ID,下面其它的富集函数都只能是ENTREZID
readable
:返回的富集分析结果里,基因ID是否转换为SYMBOL
格式
library(clusterProfiler)
library(org.Hs.eg.db)
ego <- enrichGO(gene = gene,
universe = names(geneList),
OrgDb = org.Hs.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE)
t(ego@result[1,]) #查看结果第一行的示例
# GO:0005819
# ID "GO:0005819"
# Description "spindle"
# GeneRatio "26/201"
# BgRatio "299/11840"
# pvalue "6.490593e-12"
# p.adjust "1.908234e-09"
# qvalue "1.73538e-09"
# geneID "CDCA8/CDC20/KIF23/CENPE/ASPM/DLGAP5/SKA1/NUSAP1/TPX2/TACC3/NEK2/CDK1/MAD2L1/KIF18A/BIRC5/KIF11/TRAT1/TTK/AURKB/PRC1/KIFC1/KIF18B/KIF20A/AURKA/CCNB1/KIF4A"
# Count "26"
GSEA:gseGO {clusterProfiler}
值得注意是此函数没有readable
参数。如果差异基因是ENTREZID类型,可使用setReadable()
函数再进行转换
ego2 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "CC",
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
t(ego2@result[1,])
# GO:0000228
# ID "GO:0000228"
# Description "nuclear chromosome"
# setSize "176"
# enrichmentScore "0.5697288"
# NES "2.495232"
# pvalue "1e-10"
# p.adjust "1.52e-09"
# qvalues "1.010526e-09"
# rank "2215"
# leading_edge "tags=36%, list=18%, signal=30%"
# core_enrichment "55388/10403/7153/4751/9837/332/51659/1111/891/4174/4171/5347/701/5888/23594/4998/4175/4173/54962/9918/1058/84296/699/81611/5111/64785/55506/641/8357/5427/23649/4176/58516/79980/5557/3066/11169/8607/1104/5558/4172/5424/5885/7283/10592/8914/51377/86/5393/6839/23212/3014/3619/5425/54107/11177/7273/6119/672/55320/675/5119/988/79172"
ego3 = setReadable(ego2, OrgDb = org.Hs.eg.db)
t(ego3@result[1,])
# GO:0000228
# ID "GO:0000228"
# Description "nuclear chromosome"
# setSize "176"
# enrichmentScore "0.5697288"
# NES "2.495232"
# pvalue "1e-10"
# p.adjust "1.52e-09"
# qvalues "1.010526e-09"
# rank "2215"
# leading_edge "tags=36%, list=18%, signal=30%"
# core_enrichment "MCM10/NDC80/TOP2A/NEK2/GINS1/BIRC5/GINS2/CHEK1/CCNB1/MCM5/MCM2/PLK1/BUB1B/RAD51/ORC6/ORC1/MCM6/MCM4/TIPIN/NCAPD2/CENPA/GINS4/BUB1/ANP32E/PCNA/GINS3/MACROH2A2/BLM/H3C10/POLE2/POLA2/MCM7/SINHCAF/DSN1/PRIM1/HDAC2/WDHD1/RUVBL1/RCC1/PRIM2/MCM3/POLD1/RAD21/TUBG1/SMC2/TIMELESS/UCHL5/ACTL6A/EXOSC9/SUV39H1/RRS1/H2AX/INCENP/POLD2/POLE3/BAZ1A/TTN/RPA3/BRCA1/MIS18BP1/BRCA2/CHMP1A/CDC5L/CENPO"
关于富集分析结果的可视化会在第三点记录,这里展示了一张专门使用于GO term的富集分析结果可视化。
goplot(ego)
1.2 KEGG
ORA:enrichKEGG {clusterProfiler}
- 这里的差异基因id类型必须要是entrez gene才可以
# R.utils::setOption("clusterProfiler.download.method","auto")
kk <- enrichKEGG(gene = gene,
organism = 'hsa',
pvalueCutoff = 0.05)
kk <- setReadable(kk, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(kk@result[1,])
# hsa04110
# ID "hsa04110"
# Description "Cell cycle"
# GeneRatio "11/94"
# BgRatio "124/8096"
# pvalue "1.641616e-07"
# p.adjust "3.398145e-05"
# qvalue "3.335072e-05"
# geneID "CDC45/CDC20/CCNB2/CCNA2/CDK1/MAD2L1/TTK/CHEK1/CCNB1/MCM5/PTTG1"
# Count "11"
GSEA:gseKEGG {clusterProfiler}
kk2 <- gseKEGG(geneList = geneList,
organism = 'hsa',
minGSSize = 120,
pvalueCutoff = 0.05,
verbose = FALSE)
kk2 <- setReadable(kk2, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(kk2@result[1,])
# hsa05169
# ID "hsa05169"
# Description "Epstein-Barr virus infection"
# setSize "193"
# enrichmentScore "0.433501"
# NES "1.937523"
# pvalue "2.243845e-07"
# p.adjust "1.705322e-05"
# qvalues "1.157352e-05"
# rank "2820"
# leading_edge "tags=39%, list=23%, signal=31%"
# core_enrichment "CXCL10/CCNA2/TAP1/ISG15/CCNE1/CCNE2/SKP2/STAT1/HLA-DRB4/HLA-DOB/MYC/CD3G/PSMD3/E2F1/IRAK1/CD247/CD3D/LYN/OAS1/RUNX3/OAS3/PSMD7/PLCG2/ADRM1/HDAC2/CYCS/E2F3/BAK1/CDK4/BID/CD3E/ICAM1/OAS2/PSMD14/DDX58/NFKBIB/MAPK13/SEM1/TNFAIP3/TAP2/CD19/PSMD8/IFNA21/SYK/PSMC3/NFKBIE/TNF/IL6/TLR2/PSMD2/FCER2/FADD/HLA-DQB1/PSMC4/TRAF2/RELB/HLA-G/CR2/CD40/EIF2AK2/NFKBIA/BCL2L11/SAP30/HLA-F/IRF9/IKBKE/CHUK/PSMD12/MAPK12/HLA-DMB/CALR/MAP2K3/PDIA3/HLA-DMA/PSMD1/MAPK14"
kegg通路可视化
- 单纯看一下被富集到的kegg通路
browseKEGG(kk, 'hsa04110')
- 结合差异倍数,看下被富集到的kegg通路
library("pathview")
pathview(gene.data = geneList,
pathway.id = "hsa04110",
species = "hsa",
limit = list(gene=max(abs(geneList)), cpd=1))
1.3 WikiPathways
ORA:enrichWP {clusterProfiler}
同样只接受entrez gene id
wp = enrichWP(gene, organism = "Homo sapiens")
wp <- setReadable(wp, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(wp@result[1,])
# WP2446
# ID "WP2446"
# Description "Retinoblastoma gene in cancer"
# GeneRatio "11/108"
# BgRatio "88/7847"
# pvalue "2.650788e-08"
# p.adjust "6.388399e-06"
# qvalue "5.748024e-06"
# geneID "CDC45/CCNB2/TOP2A/RRM2/CCNA2/CDK1/CDT1/TTK/CHEK1/CCNB1/KIF4A"
# Count "11"
GSEA:gseWP {clusterProfiler}
wp2 = gseWP(geneList, organism = "Homo sapiens")
wp2 <- setReadable(wp2, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(wp2@result[1,])
# WP179
# ID "WP179"
# Description "Cell cycle"
# setSize "111"
# enrichmentScore "0.6632572"
# NES "2.764158"
# pvalue "1e-10"
# p.adjust "1.846667e-08"
# qvalues "1.536842e-08"
# rank "1234"
# leading_edge "tags=40%, list=10%, signal=36%"
# core_enrichment "CDC45/CDC20/CCNB2/CCNA2/CDK1/TTK/CHEK1/CCNB1/MCM5/PTTG1/MCM2/CDC25A/CDC6/PLK1/ESPL1/CCNE1/ORC6/ORC1/CCNE2/MCM6/MCM4/DBF4/SKP2/CDC25B/BUB1/MYC/PCNA/E2F1/CDKN2A/CDC7/MCM7/SFN/HDAC2/E2F3/CDKN2C/PKMYT1/CDC25C/CDK4/MCM3/RAD21/CHEK2/TFDP1/E2F5/YWHAZ"
1.4 ReactomePA
ORA:enrichPathway {ReactomePA}
可以设置readable
参数
library(ReactomePA)
rp <- enrichPathway(gene, pvalueCutoff = 0.05, readable=TRUE)
t(rp@result[1,])
# R-HSA-69620
# ID "R-HSA-69620"
# Description "Cell Cycle Checkpoints"
# GeneRatio "22/142"
# BgRatio "294/10856"
# pvalue "2.904514e-11"
# p.adjust "1.289604e-08"
# qvalue "1.054797e-08"
# geneID "CDC45/CDCA8/MCM10/CDC20/CENPE/CCNB2/NDC80/UBE2C/SKA1/CENPM/CENPN/UBE2S/CCNA2/CDK1/ERCC6L/MAD2L1/KIF18A/BIRC5/AURKB/CHEK1/CCNB1/MCM5"
# Count "22"
GSEA:gsePathway {ReactomePA}
rp2 <- gsePathway(geneList,
pvalueCutoff = 0.2,
pAdjustMethod = "BH")
rp2 <- setReadable(rp2, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(rp2[1,])
# R-HSA-141424
# ID "R-HSA-141424"
# Description "Amplification of signal from the kinetochores"
# setSize "81"
# enrichmentScore "0.7111667"
# NES "2.825675"
# pvalue "1e-10"
# p.adjust "3.889189e-09"
# qvalues "2.8734e-09"
# rank "759"
# leading_edge "tags=31%, list=6%, signal=29%"
# core_enrichment "CDCA8/CDC20/CENPE/NDC80/SKA1/CENPM/CENPN/ERCC6L/MAD2L1/KIF18A/BIRC5/AURKB/KIF2C/PLK1/BUB1B/ZWINT/CENPU/SPC25/CENPI/CENPA/BUB1/CENPF/ZWILCH/DSN1/KNTC1"
ReactomePA富集通路可视化
viewPathway("Cell Cycle Checkpoints",
readable = TRUE,
foldChange = geneList)
1.5 Disease
DO:disease ontology
NCG:Network of Cancer Gene
DisGeNET:disease gene network
ORA
enrichDO {DOSE}
enrichNCG {DOSE}
-
enrichDGN {DOSE}
以disease ontology为例
library(DOSE)
edo <- enrichDO(gene = gene,
ont = "DO",
pvalueCutoff = 0.05,
pAdjustMethod = "BH",
universe = names(geneList),
minGSSize = 5,
maxGSSize = 500,
qvalueCutoff = 0.05,
readable = TRUE)
t(edo[1,])
# DOID:1612
# ID "DOID:1612"
# Description "breast cancer"
# GeneRatio "26/147"
# BgRatio "480/6268"
# pvalue "4.099574e-05"
# p.adjust "0.01182727"
# qvalue "0.01106885"
# geneID "MMP1/S100A9/FOXM1/S100A8/TOP2A/S100A7/NEK2/CCNA2/MAD2L1/BIRC5/S100P/EZH2/AURKA/CCNB1/PTTG1/CA12/SCGB2A2/CCN5/ERBB4/FOXA1/SCGB1D2/PIP/GATA3/NAT1/PGR/AGR2"
# Count "26"
由于疾病与基因突变有密不可分的关系,因此也可以对snp突变位点做疾病类型基因集的富集分析
GSEA
gseDO{DOSE}
gseNCG{DOSE}
-
gseDGN{DOSE}
以disease ontology为例
edo2 <- gseDO(geneList,
minGSSize = 120,
pvalueCutoff = 0.2,
pAdjustMethod = "BH",
verbose = FALSE)
edo2 <- setReadable(edo2, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(edo2[1,])
# DOID:0050338
# ID "DOID:0050338"
# Description "primary bacterial infectious disease"
# setSize "214"
# enrichmentScore "0.4569856"
# NES "2.083916"
# pvalue "1.194332e-09"
# p.adjust "1.875102e-07"
# qvalues "7.165995e-08"
# rank "1850"
# leading_edge "tags=35%, list=15%, signal=30%"
# core_enrichment "MMP1/BCL2A1/CXCL10/CXCL11/CAMP/IDO1/CCL20/ICOS/MMP9/CXCL8/PHGDH/TAP1/CD38/CTLA4/CCL5/LCN2/TREM1/LCK/PRF1/IL2RA/CCL2/SELL/PRDM1/MUC16/HLA-DRB4/GZMA/CCL4/CCR7/PSMB9/LDHC/CD247/IFNG/CD40LG/TXNRD1/DERL1/KIR2DL3/MC3R/CD86/HSPD1/IL32/CCR5/TLR1/ICAM1/SH2D1A/CCL22/PTX3/ADA/IRF1/MRC1/CD27/TAP2/MEFV/BPI/VEGFA/CD14/PTPN22/SLAMF1/B3GAT1/MIF/TNF/P2RX7/PLAUR/IL6/LTA/TLR2/BTNL2/CXCR4/CR1/PDCD1/PTGS2/APOE/CHIT1/HLA-DQB1/VCP"
1.6 meshes
ms <- enrichMeSH(gene, MeSHDb = "MeSH.Hsa.eg.db", database='gendoo', category = 'C')
ms <- setReadable(ms, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(ms[1,])
# D043171
# ID "D043171"
# Description "Chromosomal Instability"
# GeneRatio "19/196"
# BgRatio "198/16528"
# pvalue "2.431961e-12"
# p.adjust "3.051352e-09"
# qvalue "2.413276e-09"
# geneID "MMP1/CDC20/FOXM1/CENPE/MYBL2/NDC80/TOP2A/HJURP/NEK2/MAD2L1/CDT1/BIRC5/TTK/AURKB/CHEK1/AURKA/CCNB1/PTTG1/MAPT"
# Count "19"
ms2 <- gseMeSH(geneList, MeSHDb = "MeSH.Hsa.eg.db", database = 'gene2pubmed', category = "G")
ms2 <- setReadable(ms2, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(ms[2,])
# D000782
# ID "D000782"
# Description "Aneuploidy"
# GeneRatio "23/196"
# BgRatio "320/16528"
# pvalue "4.68358e-12"
# p.adjust "3.051352e-09"
# qvalue "2.413276e-09"
# geneID "MMP1/CDCA8/CDC20/CENPE/TOP2A/NEK2/CENPM/CENPN/CCNA2/CDK1/MAD2L1/BIRC5/TTK/AURKB/CHAF1B/CHEK1/AURKA/KIF4A/PTTG1/SCGB2A2/PIP/NAT1/PGR"
# Count "23"
2、自定义基因集富集分析
- 关键是设置
TERM2NAME
参数,提供自定义的基因集。 - 格式为一个两列的dataframe:
term
列为基因集名(重复),gene
列为基因名。例如下图 - 来自MsigDB的基因集数据
msigdb = clusterProfiler::read.gmt("h.all.v7.4.entrez.gmt")
head(msigdb)
# term gene
# 1 HALLMARK_TNFA_SIGNALING_VIA_NFKB 3726
# 2 HALLMARK_TNFA_SIGNALING_VIA_NFKB 2920
# 3 HALLMARK_TNFA_SIGNALING_VIA_NFKB 467
# 4 HALLMARK_TNFA_SIGNALING_VIA_NFKB 4792
# 5 HALLMARK_TNFA_SIGNALING_VIA_NFKB 7128
# 6 HALLMARK_TNFA_SIGNALING_VIA_NFKB 5743
2.1 ORA:enricher {clusterProfiler}
x <- enricher(gene, TERM2GENE = msigdb)
x <- setReadable(x, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(x[1,])
# HALLMARK_G2M_CHECKPOINT
# ID "HALLMARK_G2M_CHECKPOINT"
# Description "HALLMARK_G2M_CHECKPOINT"
# GeneRatio "31/108"
# BgRatio "200/4383"
# pvalue "1.484553e-17"
# p.adjust "5.938212e-16"
# qvalue "5.313137e-16"
# geneID "CDC45/CDC20/KIF23/CENPE/MYBL2/CCNB2/NDC80/TOP2A/UBE2C/PBK/NUSAP1/TPX2/TACC3/NEK2/SLC7A5/UBE2S/CCNA2/CDK1/MAD2L1/BIRC5/KIF11/EZH2/TTK/AURKB/GINS2/CHEK1/PRC1/AURKA/KIF4A/MCM5/PTTG1"
# Count "31"
2.2 GSEA:GSEA {clusterProfiler}
y <- GSEA(geneList, TERM2GENE = msigdb)
y <- setReadable(y, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
t(y[1,])
# HALLMARK_ALLOGRAFT_REJECTION
# ID "HALLMARK_ALLOGRAFT_REJECTION"
# Description "HALLMARK_ALLOGRAFT_REJECTION"
# setSize "199"
# enrichmentScore "0.5555282"
# NES "2.456838"
# pvalue "1e-10"
# p.adjust "5e-10"
# qvalues "2e-10"
# rank "2657"
# leading_edge "tags=53%, list=21%, signal=42%"
# core_enrichment "CXCL13/CXCL9/GZMB/TRAT1/MMP9/TAP1/CCL5/WARS1/LCK/PRF1/IL2RA/CCR1/STAT1/CCL2/CD2/HLA-DOB/GZMA/RIPK2/IL2RG/FYB1/CCL4/CD3G/CD7/CDKN2A/NME1/LTB/IL7/CD247/CD3D/LYN/CCL7/MAP4K1/SIT1/IFNG/CTSS/CD40LG/KLRD1/ITK/CD8A/BCAT1/CD86/ELANE/CD96/ITGB2/PTPRC/CCR5/CD3E/TLR1/ICAM1/IL27RA/LCP2/CCL22/IRF8/AARS1/SRGN/IL15/IL18RAP/CD28/GBP2/NCF4/TAP2/CSK/CCL13/GPR65/ABCE1/PSMB10/NCK1/IGSF6/TNF/MRPL3/CD1D/IL6/CRTAM/TLR2/WAS/HIF1A/EGFR/TRAF2/FCGR2B/HLA-G/GCNT1/BRCA1/IL13/ELF4/CXCR3/FASLG/CD40/HCLS1/CCL11/HDAC9/NCR1/UBE2D1/IL9/DEGS1/IL12B/PRKCG/SOCS1/UBE2N/IL2/CCR2/IL12A/IL2RB/IL11/TLR6/HLA-DMB"