GeneActivity():计算Seurat对象中的基因活性
-
transcripts
:GeneActivity最重要的作用是使用Seurat对象中的annotation生成transcripts,即拓展基因区域后的每个转录本,然后transcripts作为features作为FeatureMatrix()的输入。 -
frags
:Seurat对象中存储的fragments对象 -
cells
:所有细胞名
function (object, assay = NULL, features = NULL, extend.upstream = 2000,
extend.downstream = 0, biotypes = "protein_coding", max.width = 5e+05,
gene.id = FALSE, verbose = TRUE, ...)
{
if (!is.null(x = features)) {
if (length(x = features) == 0) {
stop("Empty list of features provided")
}
}
assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
stop("The requested assay is not a ChromatinAssay.")
}
annotation <- Annotation(object = object[[assay]])
if (length(x = annotation) == 0) {
stop("No gene annotations present in object")
}
if (verbose) {
message("Extracting gene coordinates")
}
transcripts <- CollapseToLongestTranscript(ranges = annotation)
if (gene.id) {
transcripts$gene_name <- transcripts$gene_id
}
if (!is.null(x = biotypes)) {
transcripts <- transcripts[transcripts$gene_biotype %in%
biotypes]
if (length(x = transcripts) == 0) {
stop("No genes remaining after filtering for requested biotypes")
}
}
if (!is.null(x = features)) {
transcripts <- transcripts[transcripts$gene_name %in%
features]
if (length(x = transcripts) == 0) {
stop("None of the requested genes were found in the gene annotation")
}
}
if (!is.null(x = max.width)) {
transcript.keep <- which(x = width(x = transcripts) <
max.width)
transcripts <- transcripts[transcript.keep]
if (length(x = transcripts) == 0) {
stop("No genes remaining after filtering for max.width")
}
}
transcripts <- Extend(x = transcripts, upstream = extend.upstream,
downstream = extend.downstream)
frags <- Fragments(object = object[[assay]])
if (length(x = frags) == 0) {
stop("No fragment information found for requested assay")
}
cells <- colnames(x = object[[assay]])
###################################################################################################
counts <- FeatureMatrix(fragments = frags, features = transcripts,
cells = cells, verbose = verbose, ...)
###################################################################################################
transcripts$gene_name <- ifelse(test = is.na(x = transcripts$gene_name),
yes = transcripts$gene_id, no = transcripts$gene_name)
gene.key <- transcripts$gene_name
names(x = gene.key) <- GRangesToString(grange = transcripts)
rownames(x = counts) <- as.vector(x = gene.key[rownames(x = counts)])
counts <- counts[rownames(x = counts) != "", ]
return(counts)
}
重要函数:FeatureMatrix()
FeatureMatrix(): 从多个fragments文件中提取特征矩阵
- 这个函数的作用主要是对fragments中的每个frag并行化运行SingleFeatureMatrix()
function (fragments, features, cells = NULL, process_n = 2000,
sep = c("-", "-"), verbose = TRUE)
{
if (!inherits(x = features, what = "GRanges")) {
if (inherits(x = features, what = "character")) {
features <- StringToGRanges(regions = features, sep = sep)
}
else {
stop("features should be a GRanges object")
}
}
if (!inherits(x = fragments, what = "list")) {
if (inherits(x = fragments, what = "Fragment")) {
fragments <- list(fragments)
}
else {
stop("fragments should be a list of Fragment objects")
}
}
if (!is.null(x = cells)) {
obj.use <- c()
for (i in seq_along(along.with = fragments)) {
if (is.null(x = Cells(fragments[[i]]))) {
obj.use <- c(obj.use, i)
}
else if (any(cells %in% Cells(x = fragments[[i]]))) {
obj.use <- c(obj.use, i)
}
}
}
else {
obj.use <- seq_along(along.with = fragments)
}
###################################################################################################
mat.list <- sapply(X = obj.use, FUN = function(x) {
SingleFeatureMatrix(fragment = fragments[[x]], features = features,
cells = cells, sep = sep, verbose = verbose, process_n = process_n)
})
###################################################################################################
if (length(x = mat.list) == 1) {
return(mat.list[[1]])
}
else {
featmat <- Reduce(f = RowMergeSparseMatrices, x = mat.list)
return(featmat)
}
}
重要函数:Signac:::SingleFeatureMatrix()
Signac:::SingleFeatureMatrix():获取单个fragment文件的特征矩阵
-
tbx
:fragment文件的索引文件 -
feature.list
:将feature(transcripts的GRanges对象)分成多个chunks的list对象,后续作为PartialMatrix()的regions参数进行并行化计算
function (fragment, features, cells = NULL, process_n = 2000,
sep = c("-", "-"), verbose = TRUE)
{
fragment.path <- GetFragmentData(object = fragment, slot = "path")
if (!is.null(cells)) {
frag.cells <- GetFragmentData(object = fragment, slot = "cells")
if (is.null(x = frag.cells)) {
names(x = cells) <- cells
}
else {
cell.idx <- fmatch(x = names(x = frag.cells), table = cells,
nomatch = 0L) > 0
cells <- frag.cells[cell.idx]
}
}
tbx <- TabixFile(file = fragment.path)
features <- keepSeqlevels(x = features, value = intersect(x = seqnames(x = features),
y = seqnamesTabix(file = tbx)), pruning.mode = "coarse")
if (length(x = features) == 0) {
stop("No matching chromosomes found in fragment file.")
}
feature.list <- ChunkGRanges(granges = features, nchunk = ceiling(x = length(x = features)/process_n))
# 将feature(transcripts的GRanges对象)分成多个chunks
if (verbose) {
message("Extracting reads overlapping genomic regions")
}
if (nbrOfWorkers() > 1) {
matrix.parts <- future_lapply(X = feature.list, FUN = PartialMatrix,
tabix = tbx, cells = cells, sep = sep, future.globals = list(),
future.scheduling = FALSE)
}
else {
mylapply <- ifelse(test = verbose, yes = pblapply, no = lapply) # 有无progress bar的lapply,这种技巧可以学习
###################################################################################################
matrix.parts <- mylapply(X = feature.list, FUN = PartialMatrix,
tabix = tbx, cells = cells, sep = sep)
###################################################################################################
}
null.parts <- sapply(X = matrix.parts, FUN = is.null)
matrix.parts <- matrix.parts[!null.parts]
if (is.null(x = cells)) {
all.cells <- unique(x = unlist(x = lapply(X = matrix.parts,
FUN = colnames)))
matrix.parts <- lapply(X = matrix.parts, FUN = AddMissingCells,
cells = all.cells)
}
featmat <- do.call(what = rbind, args = matrix.parts) # do.call():todo
if (!is.null(x = cells)) {
cell.convert <- names(x = cells)
names(x = cell.convert) <- cells
colnames(x = featmat) <- unname(obj = cell.convert[colnames(x = featmat)])
}
feat.str <- GRangesToString(grange = features, sep = sep)
featmat <- featmat[feat.str, , drop = FALSE]
return(featmat)
}
重要函数:PartialMatrix()
Signac:::PartialMatrix():在fragment的tabix文件中搜索特征对应的细胞,构成counts矩阵
-
cells.in.regions
:一个list,里面为每个feature(在这里为基因的坐标)所包含的细胞有哪些
-
cells.in.regions
后来又经过unlist()和uname()变为最终counts稀疏矩阵的列 -
feature.vec
:每个特征在多少个细胞中有,构成counts稀疏矩阵的行
function (tabix, regions, sep = c("-", "-"), cells = NULL)
{
open(con = tabix)
###################################################################################################
cells.in.regions <- GetCellsInRegion(tabix = tabix, region = regions,
cells = cells)
###################################################################################################
close(con = tabix)
gc(verbose = FALSE)
nrep <- elementNROWS(x = cells.in.regions) # 每个feature在多少个细胞中有
if (all(nrep == 0) & !is.null(x = cells)) {
featmat <- sparseMatrix(dims = c(length(x = regions),
length(x = cells)), i = NULL, j = NULL)
rownames(x = featmat) <- GRangesToString(grange = regions)
colnames(x = featmat) <- cells
featmat <- as(object = featmat, Class = "dgCMatrix")
return(featmat)
}
else if (all(nrep == 0)) {
featmat <- sparseMatrix(dims = c(length(x = regions),
0), i = NULL, j = NULL)
rownames(x = featmat) <- GRangesToString(grange = regions)
featmat <- as(object = featmat, Class = "dgCMatrix")
return(featmat)
}
else {
if (is.null(x = cells)) {
all.cells <- unique(x = unlist(x = cells.in.regions))
cell.lookup <- seq_along(along.with = all.cells)
names(x = cell.lookup) <- all.cells
}
else {
cell.lookup <- seq_along(along.with = cells)
names(cell.lookup) <- cells
}
cells.in.regions <- unlist(x = cells.in.regions)
cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
all.features <- GRangesToString(grange = regions, sep = sep)
feature.vec <- rep(x = seq_along(along.with = all.features),
nrep) # 稀疏矩阵的x
###################################################################################################
featmat <- sparseMatrix(i = feature.vec, j = cells.in.regions,
x = rep(x = 1, length(x = cells.in.regions)))
###################################################################################################
featmat <- as(Class = "dgCMatrix", object = featmat)
rownames(x = featmat) <- all.features[1:max(feature.vec)]
colnames(x = featmat) <- names(x = cell.lookup)[1:max(cells.in.regions)]
if (!is.null(x = cells)) {
featmat <- AddMissingCells(x = featmat, cells = cells)
}
missing.features <- all.features[!(all.features %in%
rownames(x = featmat))]
if (length(x = missing.features) > 0) {
null.mat <- sparseMatrix(i = c(), j = c(), dims = c(length(x = missing.features),
ncol(x = featmat)))
rownames(x = null.mat) <- missing.features
null.mat <- as(object = null.mat, Class = "dgCMatrix")
featmat <- rbind(featmat, null.mat)
}
return(featmat)
}
}
重要函数:GetCellsInRegion()
GetCellsInRegion():在fragment文件中搜索特定基因组区域对应的细胞
-
reads <- scanTabix(file = tabix, param = region)
:reads
为列表,每个元素是一个基因,里面包含与这个基因有交集的所有细胞的fragments,格式如下:
-
reads <- lapply(X = reads, FUN = ExtractCell)
:reads
为列表,每个元素是一个基因,里面包含与这个基因有交集的所有细胞,这些细胞有重复的。
最后
reads
再经过细胞的筛选,得到最终所提供region中所包含的细胞,这些细胞作为稀疏矩阵的列,重复的细胞会出现多次,在稀疏矩阵中的第三列counts均为1
function (tabix, region, cells = NULL)
{
if (!is(object = region, class2 = "GRanges")) {
region <- StringToGRanges(regions = region)
}
reads <- scanTabix(file = tabix, param = region) # region\t对应的细胞,字符串格式
reads <- lapply(X = reads, FUN = ExtractCell) # 将字符串用\t分割,提取细胞名
if (!is.null(x = cells)) {
reads <- sapply(X = reads, FUN = function(x) {
x <- x[fmatch(x = x, table = cells, nomatch = 0L) >
0L] # 只提取在所提供的细胞里面有的细胞(fragments文件里面存在很多最终peak counts矩阵里没有的细胞)
if (length(x = x) == 0) {
return(NULL)
}
else {
return(x)
}
}, simplify = FALSE)
}
return(reads)
}
解读完毕,真不容易!!
最后附上每一步的测试代码:
###### GeneActivity() ######
transcripts <- Signac:::CollapseToLongestTranscript(ranges = Annotation(pbmc.ATAC))
transcripts[which(transcripts$gene_name == 'Pag1'),]
transcripts <- Extend(x = transcripts, upstream = 2000, downstream = 0)
transcripts[which(transcripts$gene_name == 'Pag1'),]
frags <- Fragments(pbmc.ATAC)
cells <- colnames(x = pbmc.ATAC)
# counts <- FeatureMatrix(fragments = frags, features = transcripts, cells = cells)
###### FeatureMatrix() ######
obj.use <- c()
for (i in seq_along(along.with = frags)) {
if (is.null(x = Cells(frags[[i]]))) {
obj.use <- c(obj.use, i)
}
else if (any(cells %in% Cells(x = frags[[i]]))) {
obj.use <- c(obj.use, i)
}
}
obj.use
mat.list <- sapply(X = 1, FUN = function(x) {
Signac:::SingleFeatureMatrix(fragment = frags[[1]], features = transcripts,
cells = cells, sep = c("-", "-"), process_n = 2000)
})
###### SingleFeatureMatrix() ######
tbx <- Rsamtools::TabixFile(file = frags[[1]]@path)
feature.list <- Signac:::ChunkGRanges(granges = transcripts, nchunk = ceiling(x = length(x = transcripts)/2000))
###### PartialMatrix() ######
open(con = tbx)
cells.in.regions <- GetCellsInRegion(tabix = tbx, region = feature.list[[1]], cells = cells)
close(con = tabix)
nrep <- S4Vectors::elementNROWS(x = cells.in.regions)
cell.lookup <- seq_along(along.with = cells)
names(cell.lookup) <- cells
cells.in.regions <- unlist(x = cells.in.regions)
cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
all.features <- GRangesToString(grange = feature.list[[1]], sep = c("-", "-"))
feature.vec <- rep(x = seq_along(along.with = all.features), nrep)
featmat <- Matrix::sparseMatrix(i = feature.vec, j = cells.in.regions,
x = rep(x = 1, length(x = cells.in.regions)))
###### GetCellsInRegion() ######
reads <- Rsamtools::scanTabix(file = tbx, param = feature.list[[1]])
reads <- lapply(X = reads, FUN = Signac:::ExtractCell)
if (!is.null(x = cells)) {
reads <- sapply(X = reads, FUN = function(x) {
x <- x[fastmatch::fmatch(x = x, table = cells, nomatch = 0L) >
0L]
if (length(x = x) == 0) {
return(NULL)
}
else {
return(x)
}
}, simplify = FALSE)
}