单细胞笔记14-Signac::GeneActivity()函数源码解读

GeneActivity():计算Seurat对象中的基因活性

  • transcripts:GeneActivity最重要的作用是使用Seurat对象中的annotation生成transcripts,即拓展基因区域后的每个转录本,然后transcripts作为features作为FeatureMatrix()的输入。
  • frags:Seurat对象中存储的fragments对象
  • cells:所有细胞名
function (object, assay = NULL, features = NULL, extend.upstream = 2000, 
    extend.downstream = 0, biotypes = "protein_coding", max.width = 5e+05, 
    gene.id = FALSE, verbose = TRUE, ...) 
{
    if (!is.null(x = features)) {
        if (length(x = features) == 0) {
            stop("Empty list of features provided")
        }
    }
    assay <- SetIfNull(x = assay, y = DefaultAssay(object = object))
    if (!inherits(x = object[[assay]], what = "ChromatinAssay")) {
        stop("The requested assay is not a ChromatinAssay.")
    }
    annotation <- Annotation(object = object[[assay]])
    if (length(x = annotation) == 0) {
        stop("No gene annotations present in object")
    }
    if (verbose) {
        message("Extracting gene coordinates")
    }
    transcripts <- CollapseToLongestTranscript(ranges = annotation)
    if (gene.id) {
        transcripts$gene_name <- transcripts$gene_id
    }
    if (!is.null(x = biotypes)) {
        transcripts <- transcripts[transcripts$gene_biotype %in% 
            biotypes]
        if (length(x = transcripts) == 0) {
            stop("No genes remaining after filtering for requested biotypes")
        }
    }
    if (!is.null(x = features)) {
        transcripts <- transcripts[transcripts$gene_name %in% 
            features]
        if (length(x = transcripts) == 0) {
            stop("None of the requested genes were found in the gene annotation")
        }
    }
    if (!is.null(x = max.width)) {
        transcript.keep <- which(x = width(x = transcripts) < 
            max.width)
        transcripts <- transcripts[transcript.keep]
        if (length(x = transcripts) == 0) {
            stop("No genes remaining after filtering for max.width")
        }
    }
    transcripts <- Extend(x = transcripts, upstream = extend.upstream, 
        downstream = extend.downstream)
    frags <- Fragments(object = object[[assay]])
    if (length(x = frags) == 0) {
        stop("No fragment information found for requested assay")
    }
    cells <- colnames(x = object[[assay]])
###################################################################################################
    counts <- FeatureMatrix(fragments = frags, features = transcripts, 
        cells = cells, verbose = verbose, ...)
###################################################################################################
    transcripts$gene_name <- ifelse(test = is.na(x = transcripts$gene_name), 
        yes = transcripts$gene_id, no = transcripts$gene_name)
    gene.key <- transcripts$gene_name
    names(x = gene.key) <- GRangesToString(grange = transcripts)
    rownames(x = counts) <- as.vector(x = gene.key[rownames(x = counts)])
    counts <- counts[rownames(x = counts) != "", ]
    return(counts)
}

重要函数:FeatureMatrix()


FeatureMatrix(): 从多个fragments文件中提取特征矩阵

  • 这个函数的作用主要是对fragments中的每个frag并行化运行SingleFeatureMatrix()
function (fragments, features, cells = NULL, process_n = 2000, 
    sep = c("-", "-"), verbose = TRUE) 
{
    if (!inherits(x = features, what = "GRanges")) {
        if (inherits(x = features, what = "character")) {
            features <- StringToGRanges(regions = features, sep = sep)
        }
        else {
            stop("features should be a GRanges object")
        }
    }
    if (!inherits(x = fragments, what = "list")) {
        if (inherits(x = fragments, what = "Fragment")) {
            fragments <- list(fragments)
        }
        else {
            stop("fragments should be a list of Fragment objects")
        }
    }
    if (!is.null(x = cells)) {
        obj.use <- c()
        for (i in seq_along(along.with = fragments)) {
            if (is.null(x = Cells(fragments[[i]]))) {
                obj.use <- c(obj.use, i)
            }
            else if (any(cells %in% Cells(x = fragments[[i]]))) {
                obj.use <- c(obj.use, i)
            }
        }
    }
    else {
        obj.use <- seq_along(along.with = fragments)
    }
###################################################################################################
    mat.list <- sapply(X = obj.use, FUN = function(x) {
        SingleFeatureMatrix(fragment = fragments[[x]], features = features, 
            cells = cells, sep = sep, verbose = verbose, process_n = process_n)
    })
###################################################################################################
    if (length(x = mat.list) == 1) {
        return(mat.list[[1]])
    }
    else {
        featmat <- Reduce(f = RowMergeSparseMatrices, x = mat.list)
        return(featmat)
    }
}

重要函数:Signac:::SingleFeatureMatrix()


Signac:::SingleFeatureMatrix():获取单个fragment文件的特征矩阵

  • tbx:fragment文件的索引文件
  • feature.list:将feature(transcripts的GRanges对象)分成多个chunks的list对象,后续作为PartialMatrix()的regions参数进行并行化计算
function (fragment, features, cells = NULL, process_n = 2000, 
    sep = c("-", "-"), verbose = TRUE) 
{
    fragment.path <- GetFragmentData(object = fragment, slot = "path")
    if (!is.null(cells)) {
        frag.cells <- GetFragmentData(object = fragment, slot = "cells")
        if (is.null(x = frag.cells)) {
            names(x = cells) <- cells
        }
        else {
            cell.idx <- fmatch(x = names(x = frag.cells), table = cells, 
                nomatch = 0L) > 0
            cells <- frag.cells[cell.idx]
        }
    }
    tbx <- TabixFile(file = fragment.path)
    features <- keepSeqlevels(x = features, value = intersect(x = seqnames(x = features), 
        y = seqnamesTabix(file = tbx)), pruning.mode = "coarse")
    if (length(x = features) == 0) {
        stop("No matching chromosomes found in fragment file.")
    }
    feature.list <- ChunkGRanges(granges = features, nchunk = ceiling(x = length(x = features)/process_n))    
    # 将feature(transcripts的GRanges对象)分成多个chunks
    if (verbose) {
        message("Extracting reads overlapping genomic regions")
    }
    if (nbrOfWorkers() > 1) {
        matrix.parts <- future_lapply(X = feature.list, FUN = PartialMatrix, 
            tabix = tbx, cells = cells, sep = sep, future.globals = list(), 
            future.scheduling = FALSE)
    }
    else {
        mylapply <- ifelse(test = verbose, yes = pblapply, no = lapply)    # 有无progress bar的lapply,这种技巧可以学习
###################################################################################################
        matrix.parts <- mylapply(X = feature.list, FUN = PartialMatrix, 
            tabix = tbx, cells = cells, sep = sep)
###################################################################################################
    }
    null.parts <- sapply(X = matrix.parts, FUN = is.null)
    matrix.parts <- matrix.parts[!null.parts]
    if (is.null(x = cells)) {
        all.cells <- unique(x = unlist(x = lapply(X = matrix.parts, 
            FUN = colnames)))
        matrix.parts <- lapply(X = matrix.parts, FUN = AddMissingCells, 
            cells = all.cells)
    }
    featmat <- do.call(what = rbind, args = matrix.parts)    # do.call():todo
    if (!is.null(x = cells)) {
        cell.convert <- names(x = cells)
        names(x = cell.convert) <- cells
        colnames(x = featmat) <- unname(obj = cell.convert[colnames(x = featmat)])
    }
    feat.str <- GRangesToString(grange = features, sep = sep)
    featmat <- featmat[feat.str, , drop = FALSE]
    return(featmat)
}

重要函数:PartialMatrix()


Signac:::PartialMatrix():在fragment的tabix文件中搜索特征对应的细胞,构成counts矩阵

  • cells.in.regions:一个list,里面为每个feature(在这里为基因的坐标)所包含的细胞有哪些
    cells.in.regions
  • cells.in.regions后来又经过unlist()和uname()变为最终counts稀疏矩阵的
  • feature.vec:每个特征在多少个细胞中有,构成counts稀疏矩阵的
function (tabix, regions, sep = c("-", "-"), cells = NULL) 
{
    open(con = tabix)
###################################################################################################
    cells.in.regions <- GetCellsInRegion(tabix = tabix, region = regions, 
        cells = cells)
###################################################################################################
    close(con = tabix)
    gc(verbose = FALSE)
    nrep <- elementNROWS(x = cells.in.regions)    # 每个feature在多少个细胞中有
    if (all(nrep == 0) & !is.null(x = cells)) {
        featmat <- sparseMatrix(dims = c(length(x = regions), 
            length(x = cells)), i = NULL, j = NULL)
        rownames(x = featmat) <- GRangesToString(grange = regions)
        colnames(x = featmat) <- cells
        featmat <- as(object = featmat, Class = "dgCMatrix")
        return(featmat)
    }
    else if (all(nrep == 0)) {
        featmat <- sparseMatrix(dims = c(length(x = regions), 
            0), i = NULL, j = NULL)
        rownames(x = featmat) <- GRangesToString(grange = regions)
        featmat <- as(object = featmat, Class = "dgCMatrix")
        return(featmat)
    }
    else {
        if (is.null(x = cells)) {
            all.cells <- unique(x = unlist(x = cells.in.regions))
            cell.lookup <- seq_along(along.with = all.cells)
            names(x = cell.lookup) <- all.cells
        }
        else {
            cell.lookup <- seq_along(along.with = cells)
            names(cell.lookup) <- cells
        }
        cells.in.regions <- unlist(x = cells.in.regions)
        cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
        all.features <- GRangesToString(grange = regions, sep = sep)
        feature.vec <- rep(x = seq_along(along.with = all.features), 
            nrep)    # 稀疏矩阵的x
###################################################################################################
        featmat <- sparseMatrix(i = feature.vec, j = cells.in.regions, 
            x = rep(x = 1, length(x = cells.in.regions)))
###################################################################################################
        featmat <- as(Class = "dgCMatrix", object = featmat)
        rownames(x = featmat) <- all.features[1:max(feature.vec)]
        colnames(x = featmat) <- names(x = cell.lookup)[1:max(cells.in.regions)]
        if (!is.null(x = cells)) {
            featmat <- AddMissingCells(x = featmat, cells = cells)
        }
        missing.features <- all.features[!(all.features %in% 
            rownames(x = featmat))]
        if (length(x = missing.features) > 0) {
            null.mat <- sparseMatrix(i = c(), j = c(), dims = c(length(x = missing.features), 
                ncol(x = featmat)))
            rownames(x = null.mat) <- missing.features
            null.mat <- as(object = null.mat, Class = "dgCMatrix")
            featmat <- rbind(featmat, null.mat)
        }
        return(featmat)
    }
}

重要函数:GetCellsInRegion()


GetCellsInRegion():在fragment文件中搜索特定基因组区域对应的细胞

  • reads <- scanTabix(file = tabix, param = region)reads为列表,每个元素是一个基因,里面包含与这个基因有交集的所有细胞的fragments,格式如下:

    reads <- Rsamtools::scanTabix(file = tbx, param = feature.list[[1]])

  • reads <- lapply(X = reads, FUN = ExtractCell)reads为列表,每个元素是一个基因,里面包含与这个基因有交集的所有细胞,这些细胞有重复的

    reads <- lapply(X = reads, FUN = ExtractCell)

  • 最后reads再经过细胞的筛选,得到最终所提供region中所包含的细胞,这些细胞作为稀疏矩阵的列,重复的细胞会出现多次,在稀疏矩阵中的第三列counts均为1

function (tabix, region, cells = NULL) 
{
    if (!is(object = region, class2 = "GRanges")) {
        region <- StringToGRanges(regions = region)
    }
    reads <- scanTabix(file = tabix, param = region)    # region\t对应的细胞,字符串格式
    reads <- lapply(X = reads, FUN = ExtractCell)    # 将字符串用\t分割,提取细胞名
    if (!is.null(x = cells)) {
        reads <- sapply(X = reads, FUN = function(x) {
            x <- x[fmatch(x = x, table = cells, nomatch = 0L) > 
                0L]    # 只提取在所提供的细胞里面有的细胞(fragments文件里面存在很多最终peak counts矩阵里没有的细胞)
            if (length(x = x) == 0) {
                return(NULL)
            }
            else {
                return(x)
            }
        }, simplify = FALSE)
    }
    return(reads)
}


解读完毕,真不容易!!

最后附上每一步的测试代码:

###### GeneActivity() ######
transcripts <- Signac:::CollapseToLongestTranscript(ranges = Annotation(pbmc.ATAC))
transcripts[which(transcripts$gene_name == 'Pag1'),]
transcripts <- Extend(x = transcripts, upstream = 2000, downstream = 0)
transcripts[which(transcripts$gene_name == 'Pag1'),]
frags <- Fragments(pbmc.ATAC)
cells <- colnames(x = pbmc.ATAC)
# counts <- FeatureMatrix(fragments = frags, features = transcripts, cells = cells)


###### FeatureMatrix() ######
obj.use <- c()
for (i in seq_along(along.with = frags)) {
  if (is.null(x = Cells(frags[[i]]))) {
    obj.use <- c(obj.use, i)
  }
  else if (any(cells %in% Cells(x = frags[[i]]))) {
    obj.use <- c(obj.use, i)
  }
}
obj.use
mat.list <- sapply(X = 1, FUN = function(x) {
  Signac:::SingleFeatureMatrix(fragment = frags[[1]], features = transcripts, 
                      cells = cells, sep = c("-", "-"), process_n = 2000)
})


###### SingleFeatureMatrix() ######
tbx <- Rsamtools::TabixFile(file = frags[[1]]@path)
feature.list <- Signac:::ChunkGRanges(granges = transcripts, nchunk = ceiling(x = length(x = transcripts)/2000))


###### PartialMatrix() ######
open(con = tbx)
cells.in.regions <- GetCellsInRegion(tabix = tbx, region = feature.list[[1]], cells = cells)
close(con = tabix)
nrep <- S4Vectors::elementNROWS(x = cells.in.regions)
cell.lookup <- seq_along(along.with = cells)
names(cell.lookup) <- cells
cells.in.regions <- unlist(x = cells.in.regions)
cells.in.regions <- unname(obj = cell.lookup[cells.in.regions])
all.features <- GRangesToString(grange = feature.list[[1]], sep = c("-", "-"))
feature.vec <- rep(x = seq_along(along.with = all.features), nrep)
featmat <- Matrix::sparseMatrix(i = feature.vec, j = cells.in.regions, 
                                x = rep(x = 1, length(x = cells.in.regions)))


###### GetCellsInRegion() ######
reads <- Rsamtools::scanTabix(file = tbx, param = feature.list[[1]])
reads <- lapply(X = reads, FUN = Signac:::ExtractCell)
if (!is.null(x = cells)) {
  reads <- sapply(X = reads, FUN = function(x) {
    x <- x[fastmatch::fmatch(x = x, table = cells, nomatch = 0L) > 
             0L]
    if (length(x = x) == 0) {
      return(NULL)
    }
    else {
      return(x)
    }
  }, simplify = FALSE)
}
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 216,163评论 6 498
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 92,301评论 3 392
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 162,089评论 0 352
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,093评论 1 292
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,110评论 6 388
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,079评论 1 295
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,005评论 3 417
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,840评论 0 273
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,278评论 1 310
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,497评论 2 332
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,667评论 1 348
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,394评论 5 343
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,980评论 3 325
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,628评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,796评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,649评论 2 368
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,548评论 2 352

推荐阅读更多精彩内容