SCENIC运行的数据准备
需要表达矩阵和细胞信息两个文件。
1.包的下载
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::version()
# If your bioconductor version is previous to 3.9, see the section bellow
## Required
BiocManager::install(c("AUCell", "RcisTarget"))
BiocManager::install(c("GENIE3")) # Optional. Can be replaced by GRNBoost
## Optional (but highly recommended):
# To score the network on cells (i.e. run AUCell):
BiocManager::install(c("zoo", "mixtools", "rbokeh"))
# For various visualizations and perform t-SNEs:
BiocManager::install(c("DT", "NMF", "pheatmap", "R2HTML", "Rtsne"))
# To support paralell execution (not available in Windows):
BiocManager::install(c("doMC"))
# To export/visualize in http://scope.aertslab.org
if (!requireNamespace("devtools", quietly = TRUE))
install.packages("devtools")
devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
library(SCENIC)
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
devtools::install_github("aertslab/SCENIC")
packageVersion("SCENIC")
2. 评分数据库的下载
手动下载两个文件,然后下载MD5 & SHA Checksum Utility软件MD5,和官网进行比对,匹配的话再进行后续的分析。
dbFiles <- c("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-500bp-upstream-7species.mc9nr.feather",
"https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather")
# dir.create("cisTarget_databases"); setwd("cisTarget_databases") # if needed
for(featherURL in dbFiles)
{
download.file(featherURL, destfile=basename(featherURL)) # saved in current dir}
3. 表达数据的载入
GEO数据的下载
if (!requireNamespace("GEOquery", quietly = TRUE)) BiocManager::install("GEOquery")
library(GEOquery)
geoFile <- getGEOSuppFiles("GSE60361", makeDirectory=FALSE)
#解压文件
gzFile <- grep("Expression", basename(rownames(geoFile)), value=TRUE)
txtFile <- gsub(".gz", "", gzFile)
gunzip(gzFile, destname=txtFile, remove=TRUE)
library(data.table)
geoData <- fread(txtFile, sep="\t")
#提取基因名
geneNames <- unname(unlist(geoData[,1, with=FALSE]))
exprMatrix <- as.matrix(geoData[,-1, with=FALSE])
rm(geoData)
dim(exprMatrix)
rownames(exprMatrix) <- geneNames
exprMatrix <- exprMatrix[unique(rownames(exprMatrix)),]
dim(exprMatrix)
exprMatrix[1:5,1:4]
# Remove file downloaded:
file.remove(txtFile)
或者直接读取表达数据
#直接读取表达谱数据
library(data.table)
exprMatrix<-fread("protein.txt",sep="\t")#这是一个LUAD的表达矩阵,counts数。
exprMatrix<-as.data.frame(exprMatrix)
EGU<-unique(exprMatrix$id)
exprMatrix<-exprMatrix[match(EGU,exprMatrix$id),]
rownames(exprMatrix)<-exprMatrix$id
exprMatrix<-exprMatrix[,-1]
class(exprMatrix)
#载入的表达矩阵counts_filter为基因表达的count值,
#这里只通过使用edgeR包中的CPM()函数去除了文库大小,未做log处理
library(edgeR)
counts_filter<-cpm(exprMatrix)
构建一个样本信息表格
这里用的表达数据不是单细胞数据,这里样本表达只是随便构建的,但是要加上celltype这一列。
meta_filter3<-colnames(counts_filter)
b<-fread("cell.txt",header = T)
b<-as.data.frame(b)#需要先转换格式,再进行cbind
c<-counts_filter[1,]
c<-as.data.frame(c)
meta_filter3<-cbind(meta_filter3,b,c)
colnames(meta_filter3)<-c("cell","CellType","A")
class(meta_filter3)
str(meta_filter3)
4. 使用Seurat包创建对象
suppressMessages(library(Seurat))
Pollen <- CreateSeuratObject(counts = counts_filter,
meta.data =meta_filter3,
min.cells = 300,
min.features = 1500,
project = "Pollen")
Pollen
sce <- Pollen
提取sce对象中的表达矩阵与样品(cell)信息并保存
exprMat<-GetAssayData(object = sce,slot = "counts")
class(exprMat)
dim(exprMat)
exprMat[1:4,1:10]
cellInfo <-sce@meta.data
dim(cellInfo)
save(exprMat,cellInfo,file = 'Mat and cell.Rdata')
load("Mat and cell.RData")
5. SCENIC的初始设置
在后续分析中产生的数据会存储在R当前目录下的int文件夹,SCENIC输出的结果会存储在R当前目录下的output文件夹。
library(SCENIC)
org <- "hgnc"
dbDir <- "cisTarget_databases"#定位到RcisTarget databases的文件夹
myDatasetTitle <- "SCENIC analysis on Human cartilage" # Create a name for your analysis
data(defaultDbNames)
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=10)
#scenicOptions <- readRDS("int/scenicOptions.Rds")
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
6. 将表达矩阵中未在cisTarget database中收录的基因去除
library(RcisTarget)
dbFilePath <- getDatabases(scenicOptions)[[1]]
motifRankings <- importRankings(dbFilePath)
genesInDatabase <- colnames(getRanking(motifRankings))
genesLeft_minCells<-rownames(exprMat)
length(genesLeft_minCells)
genesLeft_minCells_inDatabases <- genesLeft_minCells[which(genesLeft_minCells %in% genesInDatabase)]
length(genesLeft_minCells_inDatabases)
genesKept <- genesLeft_minCells_inDatabases
exprMat_filtered <- exprMat[genesKept, ]
exprMat_filtered<-exprMat_filtered[1:500,] #测试减少计算量减少计算量
dim(exprMat_filtered)
save(exprMat_filtered,file = 'exprMat_filteredmat.Rdata')
7. 运行SCENIC
计算相关性并对数据进行log处理
class(exprMat_filtered)
runCorrelation(as.matrix(exprMat_filtered), scenicOptions)
exprMat_filtered<-log2(exprMat_filtered+1)
library(GENIE3)#推断基因共表达网络
runGenie3(as.matrix(exprMat_filtered), scenicOptions)
save(exprMat_filtered,scenicOptions,file = "input_GENIE3_data.Rdata")
8. 构建GRN并对其进行打分
#为了节约电脑内存,这里重启了R并重新载入数据
load('exprMat_filteredmat.Rdata')
exprMat<-exprMat_filtered
logMat <- log2(exprMat+1)
dim(exprMat)
#运行SCENIC
library(SCENIC)
scenicOptions <- readRDS("int/scenicOptions.Rds")
scenicOptions@settings$verbose <- TRUE
scenicOptions@settings$nCores <- 10
scenicOptions@settings$seed <- 123
scenicOptions@settings$dbs <- scenicOptions@settings$dbs["10kb"]
runSCENIC_1_coexNetwork2modules(scenicOptions)
#为了省时间跑了top5.
runSCENIC_2_createRegulons(scenicOptions, coexMethod=c("top5perTarget"))
#这一步比较耗时
runSCENIC_3_scoreCells(scenicOptions, as.matrix(logMat))
接下来是可选步骤
1 .将网络转换为二进制格式(on/off)
aucellApp <- plotTsne_AUCellApp(scenicOptions, logMat)
#得到的thresholds比较保守,可以通过认为进行调整,然后保存。
savedSelections <- shiny::runApp(aucellApp)
# Save the modified thresholds:
newThresholds <- savedSelections$thresholds
scenicOptions@fileNames$int["aucell_thresholds",1] <- "int/newThresholds.Rds"
saveRDS(newThresholds, file=getIntName(scenicOptions, "aucell_thresholds"))
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
2. 调控子的聚类和降维
# It will create all combinations between the selected “number of PCs” and “perplexity”
nPcs <- c(5) #PC不能超过调节子的个数
scenicOptions@settings$seed <- 123 # same seed for all of them
# Calculates the t-SNE based on the regulon activity. (i.e. binary/continuous AUC)
fileNames <- tsneAUC(scenicOptions, aucType="AUC", nPcs=nPcs, perpl=c(5,15,50))
fileNames <- tsneAUC(scenicOptions, aucType="AUC", nPcs=nPcs, perpl=c(5,15,50), onlyHighConf=TRUE, filePrefix="int/tSNE_oHC")
# Plot as pdf (individual files in int/):
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_", list.files("int"), value=T), value=T))
#绘图查看,需要先将细胞信息载入#load("Mat and cell.RData")
par(mfrow=c(length(nPcs), 3))
fileNames <- paste0("int/",grep(".Rds", grep("tSNE_AUC", list.files("int"), value=T, perl = T), value=T))
plotTsne_compareSettings(fileNames, scenicOptions, showLegend=FALSE, varName="CellType", cex=.5)
#将选择的TSNE保存,用于画图
scenicOptions@settings$defaultTsne$aucType <- "AUC"
scenicOptions@settings$defaultTsne$dims <- 5
scenicOptions@settings$defaultTsne$perpl <- 15
saveRDS(scenicOptions, file="int/scenicOptions.Rds")
3. 结果探索
3.1细胞状态
将AUC和TF表达投影到t-SNEs上
#logMat <- exprMat # Better if it is logged/normalized
aucellApp <- plotTsne_AUCellApp(scenicOptions, logMat) # default t-SNE
savedSelections <- shiny::runApp(aucellApp)
print(tsneFileName(scenicOptions))
tSNE_scenic <- readRDS(tsneFileName(scenicOptions))
aucell_regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
# Show TF expression:将转录因子的表达图画出来
par(mfrow=c(2,3))
AUCell::AUCell_plotTSNE(tSNE_scenic$Y, exprMat, aucell_regulonAUC[onlyNonDuplicatedExtended(rownames(aucell_regulonAUC))[c("HCFC1","DBP","TCF7","GABPA","ELF2")],], plots="Expression")
# Save AUC as PDF: 将设置新阈值后的AUCell保存。
Cairo::CairoPDF("output/Step4_BinaryRegulonActivity_tSNE_colByAUC.pdf", width=20, height=15)
par(mfrow=c(2,3))
AUCell::AUCell_plotTSNE(tSNE_scenic$Y, cellsAUC=aucell_regulonAUC, plots="AUC")
dev.off()
高密度图展示最稳定的高密度状态
library(KernSmooth)
library(RColorBrewer)
dens2d <- bkde2D(tSNE_scenic$Y, 1)$fhat
image(dens2d, col=brewer.pal(9, "YlOrBr"), axes=FALSE)
contour(dens2d, add=TRUE, nlevels=5, drawlabels=FALSE)
同时展示一些调控子
par(mfrow=c(1,2))
#2个TF
regulonNames <- c("HCFC1","DBP")
#plotTsne_rgb:基于cellCol进行颜色的分类
cellCol <- plotTsne_rgb(scenicOptions, regulonNames, aucType="AUC", aucMaxContrast=0.6)
text(0, 10, attr(cellCol,"red"), col="red", cex=.7, pos=4)
text(-20,-10, attr(cellCol,"green"), col="green3", cex=.7, pos=4)
#3个TF
regulonNames <- list(red=c("HCFC1","DBP"),
green=c("TCF7"),
blue=c( "ELF2"))
cellCol <- plotTsne_rgb(scenicOptions, regulonNames, aucType="Binary")
text(5, 15, attr(cellCol,"red"), col="red", cex=.7, pos=4)
text(5, 15-4, attr(cellCol,"green"), col="green3", cex=.7, pos=4)
text(5, 15-8, attr(cellCol,"blue"), col="blue", cex=.7, pos=4)
3.2 GRN:Regulon靶标和模序
regulons <- loadInt(scenicOptions, "regulons")
regulons[c("TCF7","ELF2")]
#调节子有10个以上基因才会有AUCell得分
regulons <- loadInt(scenicOptions, "aucell_regulons")
head(cbind(onlyNonDuplicatedExtended(names(regulons))))
regulonTargetsInfo <- loadInt(scenicOptions, "regulonTargetsInfo")
tableSubset <- regulonTargetsInfo[TF=="ELF2" & highConfAnnot==TRUE]
viewMotifs(tableSubset)
#TF motif的完整列表
motifEnrichment_selfMotifs_wGenes <- loadInt(scenicOptions, "motifEnrichment_selfMotifs_wGenes")
tableSubset <- motifEnrichment_selfMotifs_wGenes[highlightedTFs=="ELF2"]
viewMotifs(tableSubset)
3.3 已知细胞类型/簇的调节子
#根据平均调节子活性进行分群
regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
regulonAUC <- regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)),]
regulonActivity_byCellType <- sapply(split(rownames(cellInfo), cellInfo$CellType),
function(cells) rowMeans(getAUC(regulonAUC)[,cells]))
regulonActivity_byCellType_Scaled <- t(scale(t(regulonActivity_byCellType), center = T, scale=T))
#展示细胞类型和调节子的相关性热图。
pheatmap::pheatmap(regulonActivity_byCellType_Scaled, #fontsize_row=3,
color=colorRampPalette(c("blue","white","red"))(100), breaks=seq(-3, 3, length.out = 100),
treeheight_row=10, treeheight_col=10, border_color=NA)
#热图的表格展示相关性
topRegulators <- reshape2::melt(regulonActivity_byCellType_Scaled)
colnames(topRegulators) <- c("Regulon", "CellType", "RelativeActivity")
#挑出正相关的
topRegulators <- topRegulators[which(topRegulators$RelativeActivity>0),]
viewTable(topRegulators)
#保存
write.csv(topRegulators,file="topRegulators.csv",quote=F)
二进制版本,细胞类型/簇中调控子活跃的细胞百分比
minPerc <- .7#这里设置0.7比较好,说明有70%该调节子在这种细胞类型中
binaryRegulonActivity <- loadInt(scenicOptions, "aucell_binary_nonDupl")
cellInfo_binarizedCells <- cellInfo[which(rownames(cellInfo)%in% colnames(binaryRegulonActivity)),, drop=FALSE]
regulonActivity_byCellType_Binarized <- sapply(split(rownames(cellInfo_binarizedCells), cellInfo_binarizedCells$CellType),
function(cells) rowMeans(binaryRegulonActivity[,cells, drop=FALSE]))
binaryActPerc_subset <- regulonActivity_byCellType_Binarized[which(rowSums(regulonActivity_byCellType_Binarized>minPerc)>0),]
pheatmap::pheatmap(binaryActPerc_subset, # fontsize_row=5,
color = colorRampPalette(c("white","pink","red"))(100), breaks=seq(0, 1, length.out = 100),
treeheight_row=10, treeheight_col=10, border_color=NA)
topRegulators <- reshape2::melt(regulonActivity_byCellType_Binarized)
colnames(topRegulators) <- c("Regulon", "CellType", "RelativeActivity")
topRegulators <- topRegulators[which(topRegulators$RelativeActivity>minPerc),]
viewTable(topRegulators)
write.csv(topRegulators,file="topRegulators_binary.csv",quote=F)