上期我们讲了使用Seurat分析单个样本,那么2个或多个样本的分析会有哪些不同呢?单细胞转录组数据中一个重要的问题就在于批次效应,任何一个试验步骤都容易带来批次效应,多个样本的比较分析更甚。Seurat专门开发了一种能够比较多个样本的分析方法,我们一起来看看。
假设有2个样本,分别对2个样本进行数据的预处理及标准化,类似于单个样本的分析:
#样本1
matrix <- Read10X_h5("filtered_gene_bc_matrices_h5.h5")
filter_10x_object <-CreateSeuratObject(raw.data = matrix,min.cells = 3)
filter_10x_object@meta.data$group <- "sample_1"
mito.genes <- grep(pattern = "^MT-", x = rownames(x = filter_10x_object@data),
value = TRUE)
percent.mito <- Matrix::colSums(filter_10x_object@raw.data[mito.genes, ])/
Matrix::colSums(filter_10x_object@raw.data)
filter_10x_object <- AddMetaData(object = filter_10x_object, metadata
= percent.mito,col.name = "percent.mito")
filter_10x_object <- FilterCells(object = filter_10x_object, subset.names =
c("nGene", "percent.mito"), low.thresholds = c(200, -Inf),
high.thresholds = c(2500, 0.05))
filter_10x_object <- NormalizeData(object = filter_10x_object,
normalization.method = "LogNormalize", scale.factor = 10000)
filter_10x_object<- ScaleData(filter_10x_object, vars.to.regress
= c("nUMI", "percent.mito"), display.progress = F)
filter_10x_object <- FindVariableGenes(filter_10x_object, do.plot = F)
g.1 <- head(rownames(filter_10x_object@hvg.info), 1000)
#样本2
matrix2 <- Read10X_h5("filtered_gene_bc_matrices_h5.h5")
filter_10x_object_2<-CreateSeuratObject(raw.data = matrix2,min.cells = 3)
filter_10x_object_2@meta.data$group <- "sample_2"
mito.genes <- grep(pattern = "^MT-", x = rownames(x = filter_10x_object_2@data),
value = TRUE)
percent.mito <- Matrix::colSums(filter_10x_object_2@raw.data[mito.genes, ])/
Matrix::colSums(filter_10x_object_2@raw.data)
filter_10x_object_2 <- AddMetaData(object = filter_10x_object_2, metadata
= percent.mito,col.name = "percent.mito")
filter_10x_object_2<- FilterCells(object = filter_10x_object_2, subset.names =
c("nGene", "percent.mito"), low.thresholds = c(200, -Inf),
high.thresholds = c(2500, 0.05))
filter_10x_object_2<- NormalizeData(object = filter_10x_object_2,
normalization.method = "LogNormalize", scale.factor = 10000)
filter_10x_object_2<- ScaleData(filter_10x_object_2, vars.to.regress
= c("nUMI", "percent.mito"), display.progress = F)
filter_10x_object_2<- FindVariableGenes(filter_10x_object_2, do.plot = F)
g.2 <- head(rownames(filter_10x_object_2@hvg.info), 1000)
#选取2个样本高可变基因的交集作为后续分析的基因
genes.use <- unique(c(g.1, g.2))
genes.use <- intersect(genes.use, rownames(filter_10x_object@scale.data))
genes.use <- intersect(genes.use, rownames(filter_10x_object_2@scale.data))
#执行RunCCA函数,主要目的是鉴定2个数据集共有的变异源,同时也会将2个对象合并成一个对象,
#并进行数据的标准化和归一化(scale)
combined <- RunCCA(filter_10x_object, filter_10x_object_2,
genes.use = genes.use, num.cc = 30,
add.cell.id1="sample_1",add.cell.id2="sample_2")
#如果是2个以上的样本,则需要使用RunMultiCCA函数
#类似于PCA分析的PC选择,执行RunCCA后也需要选择合适的CC个数进行后续分析
p3 <- MetageneBicorPlot(combined, grouping.var = "group", dims.eval = 1:30,
display.progress = FALSE)
执行完上述步骤后进行align the CCA subspaces
combined <- AlignSubspace(combined, reduction.type = "cca",
grouping.var = "group", dims.align = 1:20)
#执行完此步骤后会返回一个新的降维数据用于后续聚类分析
combined <- RunTSNE(combined, reduction.use = "cca.aligned",
dims.use = 1:20, do.fast = T)
combined <- FindClusters(combined, reduction.type = "cca.aligned",
resolution = 0.6, dims.use = 1:20)
#接下去就根据自己的分析需求进行多组数据比较分析了,这里就不介绍了
注:Seurat的多个样本合并分析,与10x官网的cellranger aggr分析相差很大。