Seurat包中ScaleData()和NormalizeData()的区别
以pbmc3k数据为例:
## Load the data first and apply some basic processing (QC metrics, plots, filtering)
library(dplyr)
library(Seurat)
library(patchwork)
# Load the PBMC dataset
# NOTE: data.dir is a relative path, so it is resolved against the current working directory.
pbmc.data <- Read10X(data.dir = "pbmc3k_filtered_gene_bc_matrices/filtered_gene_bc_matrices/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
# min.cells and min.features apply a first filter on features and cells at creation time.
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
# Print a summary of the object
pbmc
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
# percent.mt is computed from the features whose names match "^MT-".
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
# Show both scatter plots side by side (the `+` composition comes from patchwork)
plot1 + plot2
# Keep only cells with 200 < nFeature_RNA < 2500 and percent.mt < 5
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
接下来探究NormalizeData和ScaleData对数据的处理:
先看一下原始counts值:
# Show the raw counts for rows 10-16 (expected to include TNFRSF18) in the
# first five cells.
# GetAssayData() is used instead of reading S4 slots directly
# (pbmc@assays[["RNA"]]@counts), so the lookup does not depend on the internal
# layout of the assay class.
# NOTE(review): newer SeuratObject releases rename the `slot` argument to
# `layer`; `slot` is kept here for compatibility with Seurat v3/v4 -- confirm
# against the installed version.
GetAssayData(pbmc, assay = "RNA", slot = "counts")[10:16, 1:5]
# The value of TNFRSF18 is followed through each step below to explain the
# difference between NormalizeData and ScaleData.
# Step 1: process the data with NormalizeData
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
# Same window of the matrix, now holding the log-normalized values
GetAssayData(pbmc, assay = "RNA", slot = "data")[10:16, 1:5]
经过NormalizeData处理之后,TNFRSF18在第二列细胞(AAACATTGAGCTAC-1)中的值由原始count 2变成了1.625141。
#那么NormalizeData是对数据进行了怎样的变换呢?
NormalizeData函数用于消除细胞之间测序深度(每个细胞总counts数)的差异:LogNormalize方法将每个基因的count除以该细胞的总counts,乘以scale.factor(这里是10000),再取log1p。直接使用NormalizeData()和使用下面的代码等价(以TNFRSF18的计算为例):
# Reproduce LogNormalize by hand for TNFRSF18 in the second cell
# (AAACATTGAGCTAC-1). The inputs are read from the object rather than typed in
# as literals, so the check really verifies NormalizeData() against the data.
# NOTE(review): `slot` is renamed to `layer` in newer SeuratObject releases --
# confirm against the installed version.
raw_counts <- GetAssayData(pbmc, assay = "RNA", slot = "counts")
# Raw count of TNFRSF18 in that cell
tnfrsf18_count <- raw_counts["TNFRSF18", 2]
tnfrsf18_count
# [out]: 2
# Total counts of the second cell (AAACATTGAGCTAC-1)
cell_total <- sum(raw_counts[, 2])
cell_total
# [out]: 4903
# LogNormalize: log1p(count / total counts of the cell * scale.factor)
log1p(tnfrsf18_count / cell_total * 10000)
# [out]: 1.625141 -- identical to the value produced by NormalizeData()
# Step 2: how ScaleData transforms the data
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Scale every gene, not only the variable features
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
# Look at the same window of the matrix after ScaleData.
# GetAssayData() replaces direct slot access (pbmc@assays[["RNA"]]@scale.data)
# so the lookup does not depend on the internal layout of the assay class.
# NOTE(review): `slot` is renamed to `layer` in newer SeuratObject releases --
# confirm against the installed version.
GetAssayData(pbmc, assay = "RNA", slot = "scale.data")[10:16, 1:5]
# TNFRSF18 in the second cell is now 4.86702930
# By default ScaleData applies a per-gene z-score transformation across cells.
# NOTE(review): ScaleData also clips values at its scale.max argument, so the
# hand-computed z-score only matches for values below that cap -- confirm.
tnfrsf18_expr <- GetAssayData(pbmc, assay = "RNA", slot = "data")["TNFRSF18", ]
# Mean normalized expression of TNFRSF18 over all cells
tnfrsf18_mean <- mean(tnfrsf18_expr)
tnfrsf18_mean
# [out]: 0.05966781
# Standard deviation of normalized TNFRSF18 expression over all cells
tnfrsf18_sd <- sd(tnfrsf18_expr)
tnfrsf18_sd
# [out]: 0.3216486
# z-score of the second cell, computed from the stored values rather than
# hand-copied rounded constants; [[2]] drops the cell name so a bare number prints
(tnfrsf18_expr[[2]] - tnfrsf18_mean) / tnfrsf18_sd
# [out]: 4.867029 -- identical to the value produced by ScaleData()