Step 2: Get Peak Set
Creating a peak set, a SummarizedExperiment, and LSI clustering
Preparation
Note that we need Seurat v2 here. If you previously installed Seurat v3, you will have to downgrade; the downgrade guide is at: https://satijalab.org/seurat/install.html#previous
You can also follow this post and change the package libPaths so the original v3 installation is not overwritten: https://www.jianshu.com/p/20dc76607f1b. Be aware, though, that an error may ask you to install SDMTools, like so:
install.packages("https://cran.rstudio.com//src/contrib/Archive/SDMTools/SDMTools_1.1-221.2.tar.gz", repos = NULL)
The CRAN page https://cran.r-project.org/web/packages/Seurat/index.html notes that SDMTools is no longer in the standard CRAN repository.
.libPaths()
[1] "C:/Users/lenovo/Documents/R/win-library/3.6"
[2] "C:/Program Files/R/R-3.6.2/library"
.libPaths("C:/Users/lenovo/Documents/R/win-library/Seurat2")#添加
You may be prompted about other missing packages; just install whatever is missing and it will eventually work.
If it still fails, try downloading the source above yourself and installing it locally:
#https://cran.rstudio.com//src/contrib/Archive/Seurat/Seurat_2.3.0.tar.gz
install.packages("C:/Users/lenovo/Desktop/Seurat_2.3.0.tar.gz", repos = NULL, type = "source")
Success!
library(Seurat,lib.loc = 'C:/Users/lenovo/Documents/R/win-library/Seurat2')
packageVersion("Seurat",lib.loc = 'C:/Users/lenovo/Documents/R/win-library/Seurat2')
[1] ‘2.3.0’
Functions used
library(Matrix)
library(SummarizedExperiment)
library(matrixStats)
library(readr)
library(GenomicRanges)
library(magrittr)
library(edgeR)
library(Seurat) # if Seurat v3 was installed first, load Seurat v2 instead with the lib.loc call shown above
library(BSgenome.Hsapiens.UCSC.hg19)
set.seed(1)
countInsertions <- function(query, fragments, by = "RG"){
  #Count By Fragments Insertions
  inserts <- c(
    GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
    GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
  )
  by <- "RG"
  overlapDF <- DataFrame(findOverlaps(query, inserts, ignore.strand = TRUE, maxgap=-1L, minoverlap=0L, type = "any"))
  overlapDF$name <- mcols(inserts)[overlapDF[, 2], by]
  overlapTDF <- transform(overlapDF, id = match(name, unique(name)))
  #Calculate Overlap Stats
  inPeaks <- table(overlapDF$name)
  total <- table(mcols(inserts)[, by])
  total <- total[names(inPeaks)]
  frip <- inPeaks / total
  #Summarize
  sparseM <- Matrix::sparseMatrix( # presumably each row of this sparse matrix is one query region (e.g. a 2500 bp genomic bin) and each column is a cell, identified by its barcode
    i = overlapTDF[, 1],
    j = overlapTDF[, 4],
    x = rep(1, nrow(overlapTDF)),
    dims = c(length(query), length(unique(overlapDF$name))))
  colnames(sparseM) <- unique(overlapDF$name)
  total <- total[colnames(sparseM)]
  frip <- frip[colnames(sparseM)]
  out <- list(counts = sparseM, frip = frip, total = total)
  return(out)
}
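To get a feel for what countInsertions returns, here is a minimal sketch on made-up data; the window, coordinates, and barcodes are all invented for illustration:
# Toy data: two fragments from two cells, one 500 bp query window
frags <- GRanges("chr1", IRanges(start = c(100, 300), end = c(250, 450)),
                 RG = c("cellA", "cellB"))   # RG carries the cell barcode
win <- GRanges("chr1", IRanges(1, 500))
res <- countInsertions(win, frags, by = "RG")
res$counts # 1 x 2 sparse matrix: both insertions of each fragment fall in the window
res$frip   # fraction of insertions inside the query regions, per cell (here 1 for both)
res$total  # total insertions per cell (each fragment contributes start + end = 2)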
seuratLSI <- function(mat, nComponents = 50, binarize = TRUE, nFeatures = NULL){
  #TF-IDF LSI adapted from flyATAC; it helps to understand TF-IDF and topic models beforehand
  cs <- Matrix::colSums(mat)
  if(binarize){
    message(paste0("Binarizing matrix..."))
    mat@x[mat@x > 0] <- 1 # the algorithm expects binary data
  }
  if(!is.null(nFeatures)){
    message(paste0("Getting top ", nFeatures, " features..."))
    mat <- mat[head(order(Matrix::rowSums(mat),decreasing = TRUE),nFeatures),] # keep the top nFeatures bins
  }
  #Calc TF IDF
  message("Computing Term Frequency IDF...")
  freqs <- t(t(mat)/Matrix::colSums(mat)) # the column sums are each cell's total counts, so this gives each bin's frequency within each cell (i.e. each word's frequency within its own document: the term frequency)
  idf <- as(log(1 + ncol(mat) / Matrix::rowSums(mat)), "sparseVector") # the number of cells divided by each bin's total occurrences, used as a weight: the rarer the bin, the larger its weight, meaning such bins are more cell-type specific
  tfidf <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% freqs # matrix multiplication with the diagonalized weights: each bin's weight times its per-cell frequency measures the bin's importance, achieving feature weighting ahead of dimensionality reduction
  #Calc SVD then LSI: the resulting TF-IDF matrix goes into singular value decomposition
  message("Computing SVD using irlba...")
  svd <- irlba::irlba(tfidf, nComponents, nComponents)
  svdDiag <- matrix(0, nrow=nComponents, ncol=nComponents)
  diag(svdDiag) <- svd$d
  matSVD <- t(svdDiag %*% t(svd$v))
  rownames(matSVD) <- colnames(mat)
  colnames(matSVD) <- paste0("PC",seq_len(ncol(matSVD)))
  #Make Seurat Object
  message("Making Seurat Object...")
  mat <- mat[1:100,] + 1 # a dummy 100-row matrix; CreateSeuratObject needs some data, but the real information is the LSI embedding stored below
  obj <- CreateSeuratObject(mat, project='scATAC', min.cells=0, min.genes=0)
  obj <- SetDimReduction(object = obj, reduction.type = "pca", slot = "cell.embeddings", new.data = matSVD) # store the authors' TF-IDF/LSI result in the Seurat object (the informative dimensions used later to cluster cells)
  obj <- SetDimReduction(object = obj, reduction.type = "pca", slot = "key", new.data = "PC")
  return(obj)
}
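As a sanity check on the TF-IDF arithmetic, the same three lines can be run on a tiny binary matrix (all values invented for illustration):
# 3 peaks x 2 cells; peak 1 is shared, peaks 2 and 3 are cell-specific
m <- Matrix::sparseMatrix(i = c(1, 2, 1, 3), j = c(1, 1, 2, 2), x = 1, dims = c(3, 2))
tf <- t(t(m) / Matrix::colSums(m))            # term frequency within each cell (0.5 where present)
idf <- log(1 + ncol(m) / Matrix::rowSums(m))  # shared peak: log(2); cell-specific peaks: log(3)
Matrix::Diagonal(x = as.vector(idf)) %*% tf   # the rarer, cell-specific peaks end up weighted higher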
addClusters <- function(obj, minGroupSize = 50, dims.use = seq_len(50), initialResolution = 0.8){ # the authors' hand-rolled clustering; the paper describes this "two-step" clustering approach they devised
  #First Iteration of Find Clusters
  currentResolution <- initialResolution
  obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = dims.use, resolution = currentResolution, print.output = FALSE)
  minSize <- min(table(obj@meta.data[[paste0("res.",currentResolution)]]))
  nClust <- length(unique(paste0(obj@meta.data[[paste0("res.",currentResolution)]])))
  message(sprintf("Current Resolution = %s, No of Clusters = %s, Minimum Cluster Size = %s", currentResolution, nClust, minSize))
  #If clusters are smaller than minimum group size, lower the resolution and recluster (every cluster must meet the size requirement)
  while(minSize <= minGroupSize){
    obj@meta.data <- obj@meta.data[,-which(colnames(obj@meta.data)==paste0("res.",currentResolution))]
    currentResolution <- currentResolution*initialResolution
    obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = dims.use, resolution = currentResolution, print.output = FALSE, force.recalc = TRUE)
    minSize <- min(table(obj@meta.data[[paste0("res.",currentResolution)]]))
    nClust <- length(unique(paste0(obj@meta.data[[paste0("res.",currentResolution)]])))
    message(sprintf("Current Resolution = %s, No of Clusters = %s, Minimum Cluster Size = %s", currentResolution, nClust, minSize))
  }
  return(obj)
}
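The while loop shrinks the resolution geometrically (multiplying by initialResolution each round) until the smallest cluster exceeds minGroupSize; the schedule it walks through is easy to reproduce:
# Resolutions tried with initialResolution = 0.8, matching the messages printed when it runs below
0.8 * 0.8^(0:5)
# [1] 0.800000 0.640000 0.512000 0.409600 0.327680 0.262144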
extendedPeakSet <- function(df, BSgenome = NULL, extend = 250, blacklist = NULL, nSummits = 100000){
  #Helper Functions
  readSummits <- function(file){
    df <- suppressMessages(data.frame(readr::read_tsv(file, col_names = c("chr","start","end","name","score"))))
    df <- df[,c(1,2,3,5)] #do not keep name column it can make the size really large
    return(GenomicRanges::makeGRangesFromDataFrame(df=df,keep.extra.columns = TRUE,starts.in.df.are.0based = TRUE))
  } # this helper simply turns the data.frame it reads in into a GRanges object
  nonOverlappingGRanges <- function(gr, by = "score", decreasing = TRUE, verbose = FALSE){ # this helper turns gr into a set of non-overlapping union peaks
    stopifnot(by %in% colnames(mcols(gr)))
    clusterGRanges <- function(gr, filter = TRUE, by = "score", decreasing = TRUE){
      gr <- sort(sortSeqlevels(gr))
      r <- GenomicRanges::reduce(gr, min.gapwidth=0L, ignore.strand=TRUE) # the reduce method aligns the ranges and merges overlapping ranges to produce a simplified set, i.e. all overlapping intervals in gr are collapsed into single union intervals
      o <- findOverlaps(gr,r)
      mcols(gr)$cluster <- subjectHits(o) # label each original range with the union interval it falls in; several ranges can share one union interval, so they count as one cluster
      gr <- gr[order(mcols(gr)[,by], decreasing = decreasing),]
      gr <- gr[!duplicated(mcols(gr)$cluster),] # keep only the top-ranked range per cluster
      gr <- sort(sortSeqlevels(gr))
      mcols(gr)$cluster <- NULL # gr now holds one representative per cluster (overlapping peaks have been resolved), so this label is no longer needed
      return(gr)
    }
    if(verbose){
      message("Converging", appendLF = FALSE)
    }
    i <- 0
    gr_converge <- gr
    while(length(gr_converge) > 0){
      if(verbose){
        message(".", appendLF = FALSE)
      }
      i <- i + 1
      gr_selected <- clusterGRanges(gr = gr_converge, filter = TRUE, by = by, decreasing = decreasing)
      gr_converge <- subsetByOverlaps(gr_converge ,gr_selected, invert=TRUE) #blacklist selected gr
      if(i == 1){ #if i=1 then set gr_all to clustered
        gr_all <- gr_selected
      }else{
        gr_all <- c(gr_all, gr_selected) # accumulate the selected ranges round by round
      }
    }
    if(verbose){
      message("\nSelected ", length(gr_all), " from ", length(gr))
    }
    gr_all <- sort(sortSeqlevels(gr_all))
    return(gr_all)
  }
  #Check-------
  stopifnot(extend > 0)
  stopifnot("samples" %in% colnames(df))
  stopifnot("groups" %in% colnames(df))
  stopifnot("summits" %in% colnames(df))
  stopifnot(!is.null(BSgenome))
  stopifnot(all(apply(df,1,function(x){file.exists(paste0(x[3]))})))
  #------------
  #Deal with blacklist
  if(is.null(blacklist)){
    blacklist <- GRanges()
  }else if(is.character(blacklist)){
    blacklist <- rtracklayer::import.bed(blacklist)
  }
  stopifnot(inherits(blacklist,"GenomicRanges"))
  #------------
  #Time to do stuff: the real work starts here
  chromSizes <- GRanges(names(seqlengths(BSgenome)), IRanges(1, seqlengths(BSgenome)))
  chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse")
  groups <- unique(df$groups)
  groupGRList <- GenomicRanges::GenomicRangesList(lapply(seq_along(groups), function(i){
    df_group = df[which(df$groups==groups[i]),]
    grList <- GenomicRanges::GenomicRangesList(lapply(paste0(df_group$summits), function(x){
      extended_summits <- readSummits(x) %>% # summit preprocessing
        resize(., width = 2 * extend + 1, fix = "center") %>%
        subsetByOverlaps(.,chromSizes,type="within") %>% # keep only standard-named chromosomes
        subsetByOverlaps(.,blacklist,invert=TRUE) %>% # invert: exclude the blacklist bed regions
        nonOverlappingGRanges(., by="score", decreasing=TRUE)
      extended_summits <- extended_summits[order(extended_summits$score,decreasing=TRUE)]
      if(!is.null(nSummits)){
        extended_summits <- head(extended_summits, nSummits)
      }
      mcols(extended_summits)$scoreQuantile <- trunc(rank(mcols(extended_summits)$score))/length(mcols(extended_summits)$score)
      extended_summits
    }))
    #Non Overlapping
    grNonOverlapping <- nonOverlappingGRanges(unlist(grList), by = "scoreQuantile", decreasing = TRUE)
    #Free Up Memory
    remove(grList)
    gc()
    grNonOverlapping
  }))
  grFinal <- nonOverlappingGRanges(unlist(groupGRList), by = "scoreQuantile", decreasing = TRUE)
  grFinal <- sort(sortSeqlevels(grFinal))
  return(grFinal)
}
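To watch the iterative overlap removal converge, you can copy nonOverlappingGRanges out of extendedPeakSet (it is defined as a nested helper, so it is not callable directly) and feed it three invented peaks: the best-scoring peak in each overlapping chain is kept first, and a peak that only overlapped a losing peak gets re-admitted in a later round.
gr <- GRanges("chr1",
              IRanges(start = c(50, 100, 250), width = c(100, 200, 200)),
              score = c(2, 5, 9))
# Round 1: 50-149, 100-299 and 250-449 chain into one cluster; the score-9 peak wins.
# Round 2: 100-299 is gone (it overlapped the winner), but 50-149 does not overlap
#          250-449, so it is selected now. Final set: 50-149 and 250-449.
nonOverlappingGRanges(gr, by = "score", verbose = TRUE)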
# Take the cell group of one cluster and sum across rows, i.e. sum that cluster's peak counts; this is used in the second clustering round as clusterSums
groupSums <- function(mat, groups = NULL, na.rm = TRUE, sparse = FALSE){
  stopifnot(!is.null(groups))
  stopifnot(length(groups) == ncol(mat))
  gm <- lapply(unique(groups), function(x) {
    if (sparse) {
      Matrix::rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm)
    }else {
      rowSums(mat[, which(groups == x), drop = F], na.rm = na.rm)
    }
  }) %>% Reduce("cbind", .)
  colnames(gm) <- unique(groups)
  return(gm)
}
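A minimal sketch of groupSums on toy numbers:
m <- matrix(1:12, nrow = 3)          # 3 peaks x 4 cells
grp <- c("C0", "C1", "C0", "C1")     # cluster label of each cell
groupSums(m, groups = grp)
#      C0 C1
# [1,]  8 14   # per-peak sums over the cells of each cluster
# [2,] 10 16
# [3,] 12 18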
Read in the fragment files and run
#-------------------------------------------------------------------------------------------------
# Start
#-------------------------------------------------------------------------------------------------
fragmentFiles <- list.files("data", pattern = ".rds", full.names = TRUE)
#-------------------------------------------------------------------------------------------------
# Get Counts In Windows
#-------------------------------------------------------------------------------------------------
genome <- BSgenome.Hsapiens.UCSC.hg19
chromSizes <- GRanges(names(seqlengths(genome)), IRanges(1, seqlengths(genome)))
chromSizes <- GenomeInfoDb::keepStandardChromosomes(chromSizes, pruning.mode = "coarse")
windows <- unlist(tile(chromSizes, width = 2500)) # tile the genome into 2500 bp bins
countsList <- lapply(seq_along(fragmentFiles), function(i){
  message(sprintf("%s of %s", i, length(fragmentFiles)))
  counts <- countInsertions(windows, readRDS(fragmentFiles[i]), by = "RG")[[1]]
  counts
})
mat <- lapply(countsList, function(x) x) %>% Reduce("cbind",.) # the result is a dgCMatrix, a somewhat special data type: a sparse matrix
dim(mat) # note that 3770 is exactly the number of cells left after the previous step's filtering (TSS enrichment > 8, fragments > 1000), and 1238288 x 2500 is roughly the size of the hg19 genome
[1] 1238288 3770
mat[1:4,1:4]
4 x 4 sparse Matrix of class "dgCMatrix"
PBMC#TGACAACCATTGTTCT-1 PBMC#GAAGTCTTCTCTGTTA-1 PBMC#TAAGTGCGTTTCGATG-1
[1,] . . .
[2,] . . .
[3,] . . .
[4,] . . .
PBMC#CTTAATCCATCGGCTG-1
[1,] .
[2,] .
[3,] .
[4,] .
remove(countsList)
gc()
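An optional sanity check that the window count matches the genome size:
# the standard hg19 chromosomes total ~3.1e9 bp; at 2500 bp per bin that is ~1.24e6 windows
sum(width(chromSizes)) / 2500
length(windows)  # 1238288, the number of rows in mat above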
#-------------------------------------------------------------------------------------------------
# Run LSI Clustering with Seurat
#-------------------------------------------------------------------------------------------------
set.seed(1)
message("Making Seurat LSI Object...")
obj <- seuratLSI(mat, nComponents = 25, nFeatures = 20000)
message("Adding Graph Clusters...")
obj <- addClusters(obj, dims.use = 2:25, minGroupSize = 200, initialResolution = 0.8)
saveRDS(obj, "results/Save-LSI-Windows-Seurat.rds")
clusterResults <- split(rownames(obj@meta.data), paste0("Cluster",obj@meta.data[,ncol(obj@meta.data)]))
remove(obj)
gc()
>obj <- addClusters(obj, dims.use = 2:25, minGroupSize = 200, initialResolution = 0.8)
#Current Resolution = 0.8, No of Clusters = 6, Minimum Cluster Size = 148
#Current Resolution = 0.64, No of Clusters = 6, Minimum Cluster Size = 142
#Current Resolution = 0.512, No of Clusters = 5, Minimum Cluster Size = 142
#Current Resolution = 0.4096, No of Clusters = 4, Minimum Cluster Size = 143
#Current Resolution = 0.32768, No of Clusters = 4, Minimum Cluster Size = 140
#Current Resolution = 0.262144, No of Clusters = 3, Minimum Cluster Size = 430
>obj@meta.data # the last column is the cluster group label
nGene nUMI orig.ident res.0.262144
PBMC#TGACAACCATTGTTCT-1 100 123 scATAC 0
PBMC#GAAGTCTTCTCTGTTA-1 100 128 scATAC 0
PBMC#TAAGTGCGTTTCGATG-1 100 142 scATAC 1
PBMC#CTTAATCCATCGGCTG-1 100 126 scATAC 1
PBMC#TAGCCCTAGCATACCT-1 100 117 scATAC 1
PBMC#TACGCAATCAACTTGG-1 100 124 scATAC 1
PBMC#TGCACCTGTTCCGCGA-1 100 127 scATAC 1
PBMC#GAGGTCCTCTCATATC-1 100 126 scATAC 0
str(clusterResults)
List of 3
$ Cluster0: chr [1:1875] "PBMC#TGACAACCATTGTTCT-1" "PBMC#GAAGTCTTCTCTGTTA-1" "PBMC#GAGGTCCTCTCATATC-1" "PBMC#AGTGCCGTCCTTACGC-1" ...
$ Cluster1: chr [1:1465] "PBMC#TAAGTGCGTTTCGATG-1" "PBMC#CTTAATCCATCGGCTG-1" "PBMC#TAGCCCTAGCATACCT-1" "PBMC#TACGCAATCAACTTGG-1" ...
$ Cluster2: chr [1:430] "PBMC#AACCTTTCAGAGATGC-1" "PBMC#TTACGTTAGTGCAACG-1" "PBMC#TCGCCTAGTTAACTCG-1" "PBMC#TTAGGTGTCAGGCCGT-1" ...
#-------------------------------------------------------------------------------------------------
# Get Cluster Beds
#-------------------------------------------------------------------------------------------------
This part is straightforward: extract the fragments belonging to each cluster from the fragment files, assemble them into a GRanges-like data.frame, and write them out as one bed file per cluster (the important bins).
dirClusters <- "results/LSI-Cluster-Beds/"
dir.create(dirClusters)
for(i in seq_along(fragmentFiles)){
  fragments <- readRDS(fragmentFiles[i])
  for(j in seq_along(clusterResults)){
    message(sprintf("%s of %s", j, length(clusterResults)))
    fragmentsj <- fragments[fragments$RG %in% clusterResults[[j]]]
    if(length(fragmentsj) > 0){
      out <- data.frame( # each fragment contributes two 1 bp records (its two Tn5 insertion sites) in 0-based bed coordinates
        chr = c(seqnames(fragmentsj), seqnames(fragmentsj)),
        start = c(as.integer(start(fragmentsj) - 1), as.integer(end(fragmentsj) - 1)),
        end = c(as.integer(start(fragmentsj)), as.integer(end(fragmentsj)))
      ) %>% readr::write_tsv(
        x = .,
        append = TRUE,
        path = paste0(dirClusters, paste0(names(clusterResults)[j], ".bed")),
        col_names = FALSE)
    }
  }
}
#-------------------------------------------------------------------------------------------------
# Run MACS2
#-------------------------------------------------------------------------------------------------
Here we call peaks on the bed files obtained in the previous step.
dirPeaks <- "results/LSI-Cluster-Peaks/"
method <- "q"
cutoff <- 0.05
shift <- -75
extsize <- 150
genome_size <- 2.7e9
for(j in seq_along(clusterResults)){
  message(sprintf("%s of %s", j, length(clusterResults)))
  clusterBedj <- paste0(dirClusters,names(clusterResults)[j],".bed")
  cmdPeaks <- sprintf(
    "macs2 callpeak -g %s --name %s --treatment %s --outdir %s --format BED --nomodel --call-summits --nolambda --keep-dup all",
    genome_size,
    names(clusterResults)[j],
    clusterBedj,
    dirPeaks
  )
  if (!is.null(shift) & !is.null(extsize)) {
    cmdPeaks <- sprintf("%s --shift %s --extsize %s", cmdPeaks, shift, extsize)
  }
  if (tolower(method) == "p") {
    cmdPeaks <- sprintf("%s -p %s", cmdPeaks, cutoff)
  }else {
    cmdPeaks <- sprintf("%s -q %s", cmdPeaks, cutoff)
  }
  message("Running Macs2...")
  message(cmdPeaks)
  system(cmdPeaks, intern = TRUE)
}
These are the parameters printed to the screen when macs2 runs. You can also run each cluster separately; just change Cluster to the corresponding number.
(atac) msq 15:15:57 ~/project/atac
$ macs2 callpeak -g 2.7e+09 --name Cluster0 --treatment results/LSI-Cluster-Beds/Cluster0.bed --outdir ./results/LSI-Cluster-Peaks/ --format BED --nomodel --call-summits --nolambda --keep-dup all --shift -75 --extsize 150 -q 0.05
Well done! Now we have peaks.
(atac) msq 15:29:17 ~/project/atac/results
$ ll
total 8
drwxrwxr-x 2 msq msq 4096 Apr 27 15:01 LSI-Cluster-Beds
drwxrwxr-x 2 msq msq 4096 Apr 27 15:25 LSI-Cluster-Peaks
OK, let's carry on and put the peaks we obtained to work:
#-------------------------------------------------------------------------------------------------
# Make Non-Overlapping Peak Set
#-------------------------------------------------------------------------------------------------
df <- data.frame(
  samples = gsub("\\_summits.bed","",list.files(dirPeaks, pattern = "\\_summits.bed", full.names = FALSE)),
  groups = "scATAC",
  summits = list.files(dirPeaks, pattern = "\\_summits.bed", full.names = TRUE)
)
unionPeaks <- extendedPeakSet(
  df = df,
  BSgenome = genome,
  extend = 250,
  blacklist = "data/hg19.blacklist.bed",
  nSummits = 200000 #the top 200,000 extended summits (ranked by MACS2 score) were retained, generating a 'cluster-specific peak set' for each cluster.
)
unionPeaks <- unionPeaks[seqnames(unionPeaks) %in% paste0("chr",c(1:22,"X"))]
unionPeaks <- keepSeqlevels(unionPeaks, paste0("chr",c(1:22,"X")))
>unionPeaks # the score column here is the score from the last column of the summits.bed files
GRanges object with 132455 ranges and 2 metadata columns:
seqnames ranges strand | score
<Rle> <IRanges> <Rle> | <numeric>
[1] chr1 713822-714322 * | 179.56836
[2] chr1 752462-752962 * | 50.54511
[3] chr1 756745-757245 * | 4.1483
[4] chr1 761886-762386 * | 6.55489
[5] chr1 762661-763161 * | 128.09288
#Create Counts list
countsPeaksList <- lapply(seq_along(fragmentFiles), function(i){
  message(sprintf("%s of %s", i, length(fragmentFiles)))
  gc()
  countInsertions(unionPeaks, readRDS(fragmentFiles[i]), by = "RG")
}) # the first function again, this time counting fragments within the union peaks
#CountsMatrix
mat <- lapply(countsPeaksList, function(x) x[[1]]) %>% Reduce("cbind",.)
frip <- lapply(countsPeaksList, function(x) x[[2]]) %>% unlist
total <- lapply(countsPeaksList, function(x) x[[3]]) %>% unlist
se <- SummarizedExperiment( # package the union peaks and their counts into an se object
  assays = SimpleList(counts = mat),
  rowRanges = unionPeaks
)
rownames(se) <- paste(seqnames(se),start(se),end(se),sep="_")
colData(se)$FRIP <- frip
colData(se)$uniqueFrags <- total / 2
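With FRIP and uniqueFrags stored in colData, the SummarizedExperiment can be inspected or filtered column-wise right away; a small sketch (the cutoff is illustrative, not from the paper):
summary(colData(se)$FRIP)               # distribution of the fraction of reads in peaks
seFilt <- se[, colData(se)$FRIP > 0.2]  # hypothetical QC cutoff; columns are cells
dim(seFilt)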
#----------------------------
# Get Clusters in Peaks
#----------------------------
nTop <- 25000
nPCs1 <- 1:50
nPCs2 <- 1:50
message("Making Seurat LSI Object...") #第二次聚类,基于合并的所有的peaks
obj <- seuratLSI(assay(se), nComponents = max(nPCs1), nFeatures = NULL)
stopifnot(identical(rownames(obj@meta.data), colnames(se)))
obj@meta.data <- as.data.frame(cbind(obj@meta.data, colData(se)))
message("Adding Graph Clusters...")
obj <- FindClusters(object = obj, reduction.type = "pca", dims.use = nPCs1, print.output = TRUE, n.start = 10)
#Make Pseudo Bulk Library: each cluster's peak counts are log-normalized using edgeR's 'cpm'
mat <- assay(se)
mat@x[mat@x > 0] <- 1 # binarize so each cell contributes at most 1 per peak to the cluster sums
clusterSums <- groupSums(mat = mat, groups = paste0("C",obj@meta.data$res.0.8), sparse = TRUE)
clusterSums[1:4,1:4] # you can see the counts of each peak in the different clusters
C1 C3 C4 C0
chr1_713822_714322 94 24 26 101
chr1_752462_752962 26 4 8 4
chr1_756745_757245 4 1 3 0
chr1_761886_762386 5 2 4 3
dim(clusterSums) # 132455 peaks in total
[1] 132455 7
logMat <- edgeR::cpm(clusterSums, log = TRUE, prior.count = 3)
varPeaks <- head(order(matrixStats::rowVars(logMat), decreasing = TRUE), nTop) # take the top 25,000 peaks with the greatest variability between clusters
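To see what the log-CPM transform does to the pseudo-bulk counts, here is a toy run (numbers invented); the prior count is scaled by library size, which is what attenuates the mean-variance relationship mentioned in the summary below:
toy <- matrix(c(10, 0, 90, 5, 0, 995), nrow = 3)  # 3 peaks x 2 pseudo-bulk libraries
edgeR::cpm(toy, log = TRUE, prior.count = 3)      # roughly log2(1e6 * (count + scaled prior) / libsize)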
#Re-run Seurat LSI: this time the clustering uses the 25,000 peaks most variable between clusters
message("Making Seurat LSI Object...")
obj2 <- seuratLSI(assay(se)[varPeaks,], nComponents = max(nPCs2), nFeatures = NULL)
stopifnot(identical(rownames(obj2@meta.data), colnames(se)))
obj2@meta.data <- as.data.frame(cbind(obj2@meta.data, colData(se)))
message("Adding Graph Clusters...")
obj2 <- FindClusters(object = obj2, reduction.type = "pca", dims.use = nPCs2, print.output = TRUE, n.start = 10)
#Plot UMAP. UMAP visualization; note that RunUMAP requires a separate installation: it relies on a python package, accessed from R through the reticulate package.
py_available() # from reticulate; FALSE means python is not installed or its path is not on R's search path
>use_python("C:/Users/lenovo/AppData/Local/r-miniconda/envs/r-reticulate/python.exe", required = T) # point R at the python interpreter to use
> py_install('umap-learn', pip = T, pip_ignore_installed = T)
#if the above still fails, see: https://hbctraining.github.io/scRNA-seq/lessons/umap-installation.html
#error warning: I have not solved this one yet; I will add the figure once I do
um$UMAP(object = obj2, reduction.use = "pca", dims.use = nPCs2)
Loading required package: Seurat
An old seurat object
100 genes across 3770 samples
Error in py_call_impl(callable, dots$args, dots$keywords) :
Evaluation error: Unable to convert R object to Python type.
message("Running UMAP")
obj2 <- RunUMAP(object = obj2, reduction.use = "pca", dims.use = nPCs2)
plotUMAP <- data.frame(GetCellEmbeddings(obj2,reduction.type="umap"), obj2@meta.data)
colnames(plotUMAP) <- c("x","y",colnames(plotUMAP)[3:ncol(plotUMAP)])
clustCol <- colnames(plotUMAP)[grep("res",colnames(plotUMAP))]
colData(se)$Clusters <- paste0("Cluster",as.integer(plotUMAP[,clustCol]) + 1)
colData(se)$UMAP1 <- plotUMAP$x
colData(se)$UMAP2 <- plotUMAP$y
pdf("results/LSI-Clustering-Peaks.pdf")
ggplot(plotUMAP, aes(x=x,y=y,color=res.0.8)) + geom_point(size = 0.5) +
theme_bw() + xlab("UMAP1") + ylab("UMAP2")
dev.off()
saveRDS(se, "results/scATAC-Summarized-Experiment.rds")
Summary
Get Counts In Windows
First, countInsertions counts the fragments falling into 2500 bp genomic windows, yielding a sparse matrix mat: each row is one genomic bin (2500 bp) and each column is one cell, identified by its barcode.
Run LSI Clustering with Seurat
seuratLSI applies TF-IDF followed by SVD to reduce the window matrix of mat (the top 20,000 windows, per nFeatures) to a few dimensions, and addClusters performs an initial clustering of the cells, requiring at least 200 cells per cluster.
The rationale for the 200-cell cut-off was to generate an initial cell clustering to identify confident ATAC-seq peaks (using MACS2 (ref. 17)) on grouped cells.
The theoretical ideal cluster size for the purpose of peak calling is the least number of cells required to recapitulate a bulk profile. Given the sparsity of mat, calling reliable peaks usually requires bulk-level data, so the authors first partition the cells into clusters and then call peaks on each cluster.
in other words, the cluster should be large enough to capture bulk peaks, but small enough to preserve rare cell type clusters and peaks
So how was the number 200 chosen?
As the figure above shows, when each pseudo-bulk contains about 200 cells (x axis), the resulting peaks still recover 70-80% of the peaks called on all cells together without clustering. If a cluster is too small, many peaks are lost and the recovery rate is low; if a cluster is too large, relatively rare peaks get overlooked later when the "25,000 peaks most variable between clusters" are selected (and we want to keep exactly these strongly cell-type-specific peaks so that the clustering is more effective and more biologically meaningful). Put another way, a rare cell type may comprise very few cells, and mixing those cells into a large cluster likewise makes them harder to separate out at the end.
Ideally you have some prior sense of how abundant the rare cell type is: the authors argue that if the rare cells truly number fewer than 200, you should either increase the sample size or enrich for those cells before scATAC-seq.
If the cell type of interest is indeed less frequent than 200 cells, the number of cells sampled could be increased, or rare cells could be enriched before scATAC-seq to obtain a more accurate representation of accessible sites in this population.
Get Cluster Beds
Extract from the fragment files the fragments belonging to each cluster, assemble them into a GRanges-like data.frame, and write each cluster out as a bed file.
Run MACS2
Call peaks on the bed files from the previous step, producing the summit peaks (the *_summits.bed files).
Make Non-Overlapping Peak Set
extendedPeakSet builds the union peaks (the peaks of each cluster are merged into a union separately, then everything is combined into a single GRanges object), and countInsertions is then used again to count fragments within the union peaks.
Peak calling for each cluster was performed independently to get high-quality, fixed-width, nonoverlapping peaks that represent the epigenetic diversity of all samples
Get Clusters in Peaks
The second round of clustering is based on all the merged peaks, followed by clustering on the 25,000 peaks most variable between clusters. Note that peak variability has to be computed on the log-transformed matrix (logMat), because:
we identified the top 25,000 varying peaks across all clusters using ‘rowVars’ in R. This was done on the cluster log-normalized matrix rather than the sparse binary matrix because: (1) it reduced biases due to cluster cell sizes, and (2) it attenuated the mean-variability relationship by converting to log space with a scaled prior count.
Cluster sums are used because the authors found that clustering directly on the first obj showed obvious batch effects, so they switched to obj2 ("Make Pseudo Bulk Library", i.e. use the cluster sums to select highly variable peaks and recluster on those peaks, which gives a more robust result).
Finally, UMAP for visualization. Since I wrestled with this function for a while, I temporarily substituted RunTSNE:
If you are interested, you can also compare the tsne and umap results.
Later I found that RunUMAP seems to exist only in Seurat 3, and whichever Seurat is library()'d first masks the other version, so I tried saving obj2 as an .rds under Seurat 2 and reopening it to run Seurat 3's RunUMAP, but that also failed:
11:22:10 UMAP embedding parameters a = 0.9922 b = 1.112
11:22:10 Read rows and found numeric columns
Error in if (n_neighbors > n_vertices) { : argument is of length zero
My guess is that the old Seurat 2 object is incompatible with Seurat 3.
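If the python route keeps failing, one way to sidestep the Seurat 2/3 incompatibility entirely is to run UMAP in pure R with the uwot package on the LSI embeddings; a hedged sketch (the n_neighbors and min_dist values are illustrative, not from the paper):
library(uwot)  # pure-R UMAP implementation, no python/reticulate needed
emb <- GetCellEmbeddings(obj2, reduction.type = "pca")[, nPCs2]  # Seurat v2 accessor
set.seed(1)
umapRes <- umap(emb, n_neighbors = 30, min_dist = 0.3)
plotUMAP <- data.frame(x = umapRes[, 1], y = umapRes[, 2], obj2@meta.data)
# plotUMAP now slots directly into the ggplot code above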