FindAllMarkers, FindMarkers 以及 FindConservedMarkers 的区别

在seurat中，如果运行了RunUMAP或者RunTSNE后自动分群后，FindAllMarkers和FindMarkers基本就是一样的；如果没有进行RunUMAP或者RunTSNE分群，那么需要先运行BuildClusterTree(object)函数，利用树聚类先分群

FindAllMarkers and FindMarkers

示例：
首先加载数据

pbmc.data <- read.csv("GSE117988_raw.expMatrix_PBMC.csv",header=TRUE,row.names = 1)
pbmc.data <- log2(1 + sweep(pbmc.data, 2, median(colSums(pbmc.data))/colSums(pbmc.data), '*'))
pbmc<- CreateSeuratObject(counts = pbmc.data, project = "10X pbmc", min.cells = 1, min.features = 0)

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

sce=pbmc
sce <- NormalizeData(sce, normalization.method =  "LogNormalize", scale.factor = 10000)
GetAssay(sce,assay = "RNA")
sce <- FindVariableFeatures(sce, selection.method = "vst", nfeatures = 2000) 

sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce)) 
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.2)
table(sce@meta.data$RNA_snn_res.0.2) 

set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE)
## 运行 FindAllMarkers
all.markers <- FindAllMarkers(object = sce)

先看FindAllMarkers：

object = sce
assay = NULL
features = NULL
logfc.threshold = 0.25
test.use = 'wilcox'
slot = 'data'
min.pct = 0.1
min.diff.pct = -Inf
node = NULL
verbose = TRUE
only.pos = FALSE
max.cells.per.ident = Inf
random.seed = 1
latent.vars = NULL
min.cells.feature = 3
min.cells.group = 3
mean.fxn = NULL
fc.name = NULL
base = 2
return.thresh = 1e-2
densify = FALSE

if (is.null(x = node)) {
  ## 将分群的结果进行排序
  idents.all <- sort(x = unique(x = Idents(object = object)))
  ## 一共 7 个类群
  ##[1] 0 1 2 3 4 5 6
  ##Levels: 0 1 2 3 4 5 6
} 

genes.de <- list()
messages <- list()

## 分别遍历每一个细胞类群( 0 1 2 3 4 5 6 )
for (i in 1:length(x = idents.all)) {
  if (verbose) {
    message("Calculating cluster ", idents.all[i])
  }
  genes.de[[i]] <- tryCatch(
    expr = {
      FindMarkers(
        object = object,
        assay = assay,
        ident.1 = if (is.null(x = node)) {
          idents.all[i]
        } else {
          tree
        },
        ident.2 = if (is.null(x = node)) {
          NULL
        } else {
          idents.all[i]
        },
        features = features,
        logfc.threshold = logfc.threshold,
        test.use = test.use,
        slot = slot,
        min.pct = min.pct,
        min.diff.pct = min.diff.pct,
        verbose = verbose,
        only.pos = only.pos,
        max.cells.per.ident = max.cells.per.ident,
        random.seed = random.seed,
        latent.vars = latent.vars,
        min.cells.feature = min.cells.feature,
        min.cells.group = min.cells.group,
        mean.fxn = mean.fxn,
        fc.name = fc.name,
        base = base,
        densify = densify
      )
    },
    error = function(cond) {
      return(cond$message)
    }
  )
}
gde.all <- data.frame()

# 对每一类群找到的 marker 基因进行阈值过滤
for (i in 1:length(x = idents.all)) {
  gde <- genes.de[[i]]
  gde <- gde[order(gde$p_val, -gde[, 2]), ]
  gde <- subset(x = gde, subset = p_val < return.thresh)
  gde$cluster <- idents.all[i]
  gde$gene <- rownames(x = gde)
  gde.all <- rbind(gde.all, gde)

# 重命名  
rownames(x = gde.all) <- make.unique(names = as.character(x = gde.all$gene))

我们可以知道，当运行完RunUMAP或者RunTSNE后自动分群后，FindAllMarkers 内部其实就是调用了 FindMarkers，并且：

# FindMarkers 的 ident.1 为RunUMAP或者RunTSNE后自动分群后每一个类群的细胞
ident.1 = if (is.null(x = node)) 
{
    idents.all[i]
} 

# FindMarkers 的 ident.2 默认为空 
ident.2 = if (is.null(x = node))
{
     NULL
}

而 FindMarkers 在计算差异基因的时候需要先运行 IdentsToCells() 函数

IdentsToCells <- function(
  object,
  ident.1,
  ident.2,
  cellnames.use
) {
  #
  if (is.null(x = ident.1)) {
    stop("Please provide ident.1")
  } else if ((length(x = ident.1) == 1 && ident.1[1] == 'clustertree') || is(object = ident.1, class2 = 'phylo')) {
    if (is.null(x = ident.2)) {
      stop("Please pass a node to 'ident.2' to run FindMarkers on a tree")
    }
    tree <- if (is(object = ident.1, class2 = 'phylo')) {
      ident.1
    } else {
      Tool(object = object, slot = 'BuildClusterTree')
    }
    if (is.null(x = tree)) {
      stop("Please run 'BuildClusterTree' or pass an object of class 'phylo' as 'ident.1'")
    }
    ident.1 <- tree$tip.label[GetLeftDescendants(tree = tree, node = ident.2)]
    ident.2 <- tree$tip.label[GetRightDescendants(tree = tree, node = ident.2)]
  }
  if (length(x = as.vector(x = ident.1)) > 1 &&
      any(as.character(x = ident.1) %in% cellnames.use)) {
    bad.cells <- cellnames.use[which(x = !as.character(x = ident.1) %in% cellnames.use)]
    if (length(x = bad.cells) > 0) {
      stop(paste0("The following cell names provided to ident.1 are not present in the object: ", paste(bad.cells, collapse = ", ")))
    }
  } else {
    ident.1 <- WhichCells(object = object, idents = ident.1)
  }
  # if NULL for ident.2, use all other cells
  if (length(x = as.vector(x = ident.2)) > 1 &&
      any(as.character(x = ident.2) %in% cellnames.use)) {
    bad.cells <- cellnames.use[which(!as.character(x = ident.2) %in% cellnames.use)]
    if (length(x = bad.cells) > 0) {
      stop(paste0("The following cell names provided to ident.2 are not present in the object: ", paste(bad.cells, collapse = ", ")))
    }
  } else {
    if (is.null(x = ident.2)) {
      ident.2 <- setdiff(x = cellnames.use, y = ident.1)
    } else {
      ident.2 <- WhichCells(object = object, idents = ident.2)
    }
  }
  return(list(cells.1 = ident.1, cells.2 = ident.2))
}

对于 FindMarkers 若 ident.2 = NULL ，那么 IdentsToCells() 函数将会选择除 ident.1 类群以外的剩下所有细胞

# select which data to use
assay <- assay %||% DefaultAssay(object = sce)
data.use <- object[[assay]]
# cellnames.use 代表所有的细胞
cellnames.use <-  colnames(x = data.use)

 # if NULL for ident.2, use all other cells
 if (is.null(x = ident.2)) {
      ## 选择除 ident.1 类群以外的其他所有细胞，setdiff 为取差集
      ident.2 <- setdiff(x = cellnames.use, y = ident.1)
 } else {
      ident.2 <- WhichCells(object = object, idents = ident.2)
   }
}

因此，对于 FindMarkers 若 ident.2 = NULL ，计算的是 ident.1 细胞类群与除 ident.1 类群以外的剩下所有细胞相比的差异基因；如果 ident.2 = 某一细胞类群 那么计算的是 ident.1 细胞类群与ident.2 类群相比的差异基因

FindConservedMarkers

# 安装必要的包
BiocManager::install('multtest')
install.packages('metap')
install.packages('sn')
library(Seurat)

# 加载示例数据
data("pbmc_small")
# 假设所有细胞一共分为 'g1' 和 'g2' 两种细胞类群
pbmc_small[['groups']] <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc_small), replace = TRUE)
FindConservedMarkers(pbmc_small, ident.1 = 0, ident.2 = 1, grouping.var = "groups")

pbmc_small[['groups']]
> pbmc_small[['groups']]
               groups
ATGCCAGAACGACT     g1
CATGGCCTGTGCAT     g2
GAACCTGATGAACC     g1
TGACTGGATTCTCA     g1
AGTCAGACTGCACA     g2
TCTGATACACGTGT     g1
TGGTATCTAAACAG     g1
GCAGCTCTGTTTCT     g1
GATATAACACGCAT     g2
AATGTTGACAGTCA     g2


### FindConservedMarkers 源代码分析
object = pbmc_small
ident.1 = 0
ident.2 = 1
grouping.var = "groups"
assay = 'RNA'
slot = 'data'
min.cells.group = 3
meta.method = metap::minimump
verbose = TRUE

# 假设有两个分组, 分别为 'g1' 和 'g2' 
# ident.1 和 ident.2 为每一组('g1' 和 'g2') 的细胞分群, ident.1 = 0, ident.2 = 1
# 提取每个细胞对应分组的信息,即每个细胞分为 'g1' 还是 'g2'
object.var <- FetchData(object = object, vars = grouping.var)
# 提取 'g1' 和 'g2' 的细胞
object <- SetIdent(
  object = object,
  cells = colnames(x = object),
  value = paste(Idents(object = object), object.var[, 1], sep = "_")
)
levels.split <- names(x = sort(x = table(object.var[, 1])))
num.groups <- length(levels.split)
cells <- list()

# 分别将 'g1' 和 'g2' 的细胞单独取出来,  cells[[1]] 为 'g1' 的细胞, cells[[2]] 为 'g2' 的细胞
for (i in 1:num.groups) {
  cells[[i]] <- rownames(
    x = object.var[object.var[, 1] == levels.split[i], , drop = FALSE]
  )
}
marker.test <- list()
# do marker tests
ident.2.save <- ident.2
# 分别遍历 'g1' 和 'g2' 
for (i in 1:num.groups) {
  level.use <- levels.split[i]
  ident.use.1 <- paste(ident.1, level.use, sep = "_")
  ident.use.1.exists <- ident.use.1 %in% Idents(object = object)

  ident.2 <- ident.2.save
  cells.1 <- WhichCells(object = object, idents = ident.use.1)
  ident.use.2 <- paste(ident.2, level.use, sep = "_")
  ident.use.2.exists <- ident.use.2 %in% Idents(object = object)
  
  ## ident.use.1 = "0_g1" or "0_g2"; ident.use.2 = "1_g1" or "1_g2"
  ## 分别对 'g1' 计算类群 0 与类群 1 的差异基因; 对 'g2' 计算类群 0 与类群 1 的差异基因
  marker.test[[i]] <- FindMarkers(
    object = object,
    assay = assay,
    slot = slot,
    ident.1 = ident.use.1,
    ident.2 = ident.use.2,
    verbose = verbose
  )
  names(x = marker.test)[i] <- levels.split[i]
}
## 过滤 NULL 值
marker.test <- Filter(f = Negate(f = is.null), x = marker.test)
## 对 'g1' 中类群 0 与类群 1 差异基因item和 'g2' 中类群 0 与类群 1 差异基因item取交集
genes.conserved <- Reduce(
  f = intersect,
  x = lapply(
    X = marker.test,
    FUN = function(x) {
      return(rownames(x = x))
    }
  )
)

## 将剩下的 logFC, p_val 整理进去
markers.conserved <- list()
for (i in 1:length(x = marker.test)) {
  markers.conserved[[i]] <- marker.test[[i]][genes.conserved, ]
  colnames(x = markers.conserved[[i]]) <- paste(
    names(x = marker.test)[i],
    colnames(x = markers.conserved[[i]]),
    sep = "_"
  )
}

## 合并对 'g1' 中类群 0 与类群 1 差异基因item和 'g2' 中类群 0 与类群 1 差异基因item取交集后的表格
## 包括 logFC, p_val
markers.combined <- Reduce(cbind, markers.conserved)
pval.codes <- colnames(x = markers.combined)[grepl(pattern = "*_p_val$", x = colnames(x = markers.combined))]
if (length(x = pval.codes) > 1) {
  markers.combined$max_pval <- apply(
    X = markers.combined[, pval.codes, drop = FALSE],
    MARGIN = 1,
    FUN = max
  )
  combined.pval <- data.frame(cp = apply(
    X = markers.combined[, pval.codes, drop = FALSE],
    MARGIN = 1,
    FUN = function(x) {
      return(meta.method(x)$p)
    }
  ))
  meta.method.name <- as.character(x = formals()$meta.method)
  colnames(x = combined.pval) <- paste0(meta.method.name, "_p_val")
  markers.combined <- cbind(markers.combined, combined.pval)
  markers.combined <- markers.combined[order(markers.combined[, paste0(meta.method.name, "_p_val")]), ]
}

所谓的 FindConservedMarkers 其实是在单细胞分析中，有两个处理组 'g1' 和 'g2' ，而 'g1' 和 'g2' 组中又恰好有相同的细胞亚群 0 和 1，求在 'g1' 中细胞亚群 0 和细胞亚群 1 的差异基因，并且与在 'g2' 中细胞亚群 0 和细胞亚群 1 的差异基因取交集；这部分基因定义为在 'g1' 和 'g2' 组中，细胞亚群 0 和细胞亚群 1 差异保守的基因