Y叔的书 - 笔记
chapter 3 - universal enrichment analysis
input data
for gene set enrichment analysis - a ranked list of genes
note: duplicated gene IDs are not allowed
example
# Load the example gene list shipped with the 'DOSE' package
data(geneList, package = "DOSE")
head(geneList)
# Treat genes whose absolute geneList value exceeds 2 as differentially
# expressed (the original note describes this as "fold change greater than 2")
is_de <- abs(geneList) > 2
gene <- names(geneList)[is_de]
head(gene)
wikipathways analysis
a continuously updated pathway database by a community of researchers
monthly releases of gmt files
Download the appropriate gmt file, then generate TERM2GENE and TERM2NAME tables to use with the enricher and GSEA functions
# WikiPathways over-representation analysis using the gmt file bundled with
# clusterProfiler.
library(magrittr)  # provides the %>% pipe used below
suppressPackageStartupMessages(library(clusterProfiler))

# Example gene list from 'DOSE'; keep the names of entries whose absolute
# value exceeds 2 as the genes of interest.
data(geneList, package = "DOSE")
str(geneList)
gene <- names(geneList)[abs(geneList) > 2]
head(gene)

# mustWork = TRUE makes a missing bundled file fail right here with a clear
# error instead of silently passing an empty path ("") on to read.gmt().
wpgmtfile <- system.file(
  "extdata/wikipathways-20180810-gmt-Homo_sapiens.gmt",
  package = "clusterProfiler",
  mustWork = TRUE
)
wp2gene <- read.gmt(wpgmtfile)

# Split the "%"-delimited term label into four fields.
# NOTE(review): assumes read.gmt() names the term column `ont` -- confirm for
# the installed clusterProfiler version.
wp2gene <- wp2gene %>%
  tidyr::separate(ont, c("name", "version", "wpid", "org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene)  # TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name)  # TERM2NAME
# Note to self: really need to get good at the tidyr / dplyr packages

ewp <- enricher(gene, TERM2GENE = wpid2gene, TERM2NAME = wpid2name)
head(ewp)
Alternatively, you can install the rWikiPathways package to download gmt files manually
# Install BiocManager first if it is not available, then use it to install
# rWikiPathways.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("rWikiPathways")
# Next step: call downloadPathwayArchive() to fetch the latest gmt files
cell marker
# Build a TERM2GENE-style table from the CellMarker human marker file:
# one combined "tissue, cancer, cell" label per row plus its gene IDs.
cell_marker_url <- "http://bio-bigdata.hrbmu.edu.cn/CellMarker/download/Human_cell_markers.txt"
cell_markers <- vroom::vroom(cell_marker_url) %>%
  tidyr::unite("cellMarker", tissueType, cancerType, cellName, sep = ", ") %>%
  dplyr::select(cellMarker, geneID) %>%
  # geneID holds comma-separated IDs; split each entry into a vector
  dplyr::mutate(geneID = strsplit(geneID, ", "))
cell_markers

# Enrichment of the genes of interest against the cell-marker sets;
# minGSSize = 1 lowers the minimum gene-set size to 1.
y <- enricher(gene, TERM2GENE = cell_markers, minGSSize = 1)
y %>%
  as.data.frame() %>%
  DT::datatable()
MSigDb analysis
Molecular Signatures Database - 8 major collections
download gmt files from Broad Institute
H: hallmark gene sets
C1: positional gene sets
C2: curated gene sets
C3: motif gene sets
C4: computational gene sets
C5: GO gene sets
C6: oncogenic signatures
C7: immunologic signatures
package msigdbr - recommended by uncle Y
library(msigdbr)
# List the species available in msigdbr. msigdbr_species() replaces the
# deprecated msigdbr_show_species().
msigdbr_species()

# Retrieve all human gene sets
m_df <- msigdbr(species = "Homo sapiens")
head(m_df, 2) %>% as.data.frame()

# Or retrieve a specific collection (here C6) and keep only the two columns
# needed for a TERM2GENE table: gene-set name and Entrez gene ID.
m_t2g <- msigdbr(species = "Homo sapiens", category = "C6") %>%
  dplyr::select(gs_name, entrez_gene)
head(m_t2g)
question:
as mentioned by uncle Y:
"using C3 to test whether the genes are up/down-regulated by sharing specific motif"
background knowledge - C3