DE
比较不同的DE分析工具
bulk RNA-seq,现在最流行的就是DESeq2 和 edgeR。对单细胞测序数据来说,通常需要先聚类之后把细胞群体进行分组,然后来比较不同的组的差异表达情况。跟bulk RNA-seq不一样的地方是,scRNA-seq通常涉及到的样本数量更多。这时候可以使用非参检验算法,比如Kolmogorov-Smirnov test (KS-test) 等等。
scRNAseq数据的模型,常见的有负二项分布的模型。一般UMI的数据拟合这种模型较好。
# Draw 1000 values from a negative binomial distribution (mean 10,
# size 100) with a fixed seed and show them as a histogram.
set.seed(1)
hist(
  x = rnbinom(n = 1000, size = 100, mu = 10),
  main = "Negative Binomial",
  xlab = "Read Counts",
  col = "grey50"
)
但是如果数据的dropout率很高,就不太适合,应该拟合零膨胀的负二项分布模型。
# Zero-inflated variant: draw negative binomial counts as before, then
# overwrite each value with 0 with probability `d` to mimic dropouts.
d <- 0.5
counts <- rnbinom(n = 1000, size = 100, mu = 10)
# One uniform draw per count; draws below `d` mark the entries to zero out.
counts <- replace(counts, runif(1000) < d, 0)
hist(
  x = counts,
  main = "Zero-inflated NB",
  xlab = "Read Counts",
  col = "grey50"
)
MAST(https://bioconductor.org/packages/release/bioc/html/MAST.html)和 scde(https://bioconductor.org/packages/release/bioc/html/scde.html)都是基于这个模型进行分析的。
library(edgeR)
library(DESeq2)
library(MAST)
library(ROCR)
# Fix the random seed, then load the benchmark input from a saved workspace.
# NOTE(review): the path is an absolute, machine-specific Windows path;
# adjust it before running elsewhere.
set.seed(123)
load("D:/paopaoR/normalize/scRNAseq_DEG_input.Rdata")
# Inspect the objects used below. They are presumably restored by load();
# confirm against the .Rdata file. The `##` lines are recorded console output.
dim(norm)
## [1] 16026 576
dim(DE)
## [1] 1083 1
dim(notDE)
## [1] 10897 1
# `group` has three levels, one of which (NA19098) has no cells.
table(group)
## group
## NA19098 NA19101 NA19239
## 0 288 288
已知的1083个基因是确定显著差异的,另外10897个基因是确定不显著的(假定这个是金标准)。
| | DE | notDE |
|---|---|---|
| sigDE | tp | fp |
| notsigDE | fn | tn |
评价的主要指标:
- True positive rate (TPR), TP/(TP + FN)
- True negative rate (TNR), TN/(FP + TN)
- ROC
- AUC
Kolmogorov-Smirnov test
single cell RNA-seq中,一般样本(细胞)量很多,可以通过检验每个组中表达值的整体分布来识别组间的差异,而不是像bulk RNA-seq的标准做法那样仅仅比较平均表达估计值。在统计学中,ks检验基于累计分布函数,用以检验两个经验分布是否不同或一个经验分布与另一个理想分布是否不同,是一种非参数的检验方法。非参数检验一般将观察到的表达值转换为秩,检验一组的秩分布是否与另一组的秩分布有显著差异。但是当出现大量的相持数据,即tie时,可能表现不好。所以scRNA-seq用ks检验有弊端,首先是它假设基因表达量是连续的,如果有很多细胞表达量一致,比如都是0,表现就很差。其次它对大样本量太敏感了,可能其实差异并不大,但是样本数量很多,也会被认为是显著差异。
# Per-gene two-sample Kolmogorov-Smirnov test: for every row of `norm`,
# compare the values of the NA19101 cells with those of the NA19239 cells
# and keep the p-value.
pVals <- apply(norm, 1, function(expr) {
  from_na19101 <- expr[group == "NA19101"]
  from_na19239 <- expr[group == "NA19239"]
  ks.test(from_na19101, from_na19239)$p.value
})
# Multiple-testing correction (Benjamini-Hochberg FDR), then call genes
# with an adjusted p-value below 0.05 significant.
pVals <- p.adjust(pVals, method = "fdr")
sigDE <- names(pVals)[pVals < 0.05]
length(sigDE)
## [1] 5095
sum(GroundTruth$DE %in% sigDE)
## [1] 792
sum(GroundTruth$notDE %in% sigDE)
## [1] 3190
# Genes that were tested but not called significant; computed once and
# reused for both the true-negative and false-negative counts.
notSigDE <- names(pVals)[pVals >= 0.05]
tp <- sum(GroundTruth$DE %in% sigDE) # true positives
fp <- sum(GroundTruth$notDE %in% sigDE) # false positives
tn <- sum(GroundTruth$notDE %in% notSigDE) # true negatives
fn <- sum(GroundTruth$DE %in% notSigDE) # false negatives
tpr <- tp / (tp + fn) # sensitivity
fpr <- fp / (fp + tn)
tnr <- tn / (fp + tn) # specificity
# Correct calls as a fraction of ALL tested genes. The denominator is the
# number of tested genes (previously hard-coded as 16026), not only the
# genes covered by the ground truth, so this is lower than a conventional
# accuracy.
tr <- (tp + tn) / length(pVals)
cat(c(tpr, tnr, tr))
## 0.7346939 0.7055294 0.5263322
# Keep a copy of the adjusted KS p-values, then restrict `pVals` to the
# genes that appear in either ground-truth list.
ks_pVals <- pVals
pVals <- pVals[(names(pVals) %in% GroundTruth$notDE) |
  (names(pVals) %in% GroundTruth$DE)]
# Labels for ROCR: 0 for ground-truth DE genes, 1 for everything else kept.
truth <- as.numeric(!(names(pVals) %in% GroundTruth$DE))
pred <- ROCR::prediction(pVals, truth)
# ROC curve (true positive rate against false positive rate).
perf <- ROCR::performance(pred, "tpr", "fpr")
ROCR::plot(perf)
# Area under the ROC curve.
aucObj <- ROCR::performance(pred, "auc")
slot(aucObj, "y.values")[[1]] # AUC
## [1] 0.7954796
# Plot the ROC curve for a vector of p-values and return the area under it.
#
# Args:
#   pVals: named numeric vector of p-values; names are gene identifiers.
#          Only genes listed in the global `GroundTruth$DE` or
#          `GroundTruth$notDE` are used.
#
# Returns:
#   The AUC reported by ROCR (a single number). Draws the ROC curve as a
#   side effect.
DE_Quality_AUC <- function(pVals) {
  gene_ids <- names(pVals)
  is_de <- gene_ids %in% GroundTruth$DE
  is_not_de <- gene_ids %in% GroundTruth$notDE
  covered <- is_de | is_not_de
  scored <- pVals[covered]
  # Labels for ROCR: 0 for ground-truth DE genes, 1 for the remaining genes.
  labels <- as.numeric(!is_de[covered])
  rocr_pred <- ROCR::prediction(scored, labels)
  ROCR::plot(ROCR::performance(rocr_pred, "tpr", "fpr"))
  auc_perf <- ROCR::performance(rocr_pred, "auc")
  auc_perf@y.values[[1]]
}
# Print (and invisibly return) detection-quality rates for a set of genes
# called significant, judged against the global `GroundTruth` list
# (`GroundTruth$DE` = known DE genes, `GroundTruth$notDE` = known non-DE).
#
# Args:
#   sigDE:    character vector of gene names called differentially expressed.
#   p_values: named vector of (adjusted) p-values, names being gene names;
#             used to find the genes that were tested but not called
#             significant. Defaults to the global `pVals`, which is what the
#             previous implementation read implicitly.
#   alpha:    cutoff; tested genes with p-value >= alpha count as
#             not significant. Default 0.05, as before.
#   n_total:  denominator of the overall rate `tr`. The default 16026 is the
#             constant that used to be hard-coded in the body; pass the
#             number of tested genes when using another data set.
#
# Returns:
#   Invisibly, a named numeric vector c(tpr, tnr, tr). The same three numbers
#   are written to the console with cat(), space-separated.
DE_Quality_rate <- function(sigDE, p_values = pVals, alpha = 0.05,
                            n_total = 16026) {
  # which() drops NA p-values so they never show up as NA gene names.
  not_sig <- names(p_values)[which(p_values >= alpha)]
  tp <- sum(GroundTruth$DE %in% sigDE)
  fp <- sum(GroundTruth$notDE %in% sigDE)
  tn <- sum(GroundTruth$notDE %in% not_sig)
  fn <- sum(GroundTruth$DE %in% not_sig)
  tpr <- tp / (tp + fn)
  tnr <- tn / (fp + tn)
  tr <- (tp + tn) / n_total
  cat(c(tpr, tnr, tr))
  invisible(c(tpr = tpr, tnr = tnr, tr = tr))
}
wilcoxon test
# Per-gene Wilcoxon rank-sum test between the NA19101 and NA19239 cells,
# one p-value per row of `norm`.
pVals <- apply(norm, 1, function(expr) {
  from_na19101 <- expr[group == "NA19101"]
  from_na19239 <- expr[group == "NA19239"]
  wilcox.test(from_na19101, from_na19239)$p.value
})
# Multiple-testing correction (FDR), then call genes below 0.05 significant.
pVals <- p.adjust(pVals, method = "fdr")
sigDE <- names(pVals)[pVals < 0.05]
# Keep a copy of the adjusted Wilcoxon p-values under their own name.
Wilcox_pVals <- pVals
DE_Quality_rate(sigDE)
## 0.8376623 0.6270654 0.4802196
edgeR
# edgeR: fit a negative binomial GLM per gene and test the group effect
# with a likelihood-ratio test.
library(edgeR)
# Normalisation factors are fixed at 1 for every column of `counts`.
dge <- DGEList(
  counts = counts,
  norm.factors = rep(1, ncol(counts)),
  group = group
)
# factor() rebuilds the grouping factor, dropping levels without samples.
group_edgeR <- factor(group)
design <- model.matrix(~group_edgeR)
dge <- estimateDisp(dge, design = design, trend.method = "none")
fit <- glmFit(dge, design)
res <- glmLRT(fit)
# Take the p-value column by name rather than by position so the code does
# not depend on the column order of the result table.
pVals <- res$table$PValue
names(pVals) <- rownames(res$table)
pVals <- p.adjust(pVals, method = "fdr")
sigDE <- names(pVals)[pVals < 0.05]
# Keep a copy of the adjusted edgeR p-values under their own name.
edgeR_pVals <- pVals
DE_Quality_rate(sigDE)
## 0.8682746 0.6089726 0.4700487
DE_Quality_AUC(pVals)
## [1] 0.8466764
DESeq2
# DESeq2: build the data set from the raw counts with the cell grouping as
# the only covariate, run the standard pipeline and extract the contrast
# NA19101 vs NA19239.
colData <- data.frame(row.names = colnames(counts), group_list = group)
dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = colData,
  design = ~group_list
)
## factor levels were dropped which had no samples
dds2 <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 4 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds2, contrast = c("group_list", "NA19101", "NA19239"))
# Adjusted p-values, taken by column name instead of position.
pVals <- res$padj
names(pVals) <- rownames(res)
# padj can be NA (for example for genes removed by independent filtering);
# which() keeps those from turning into NA entries of sigDE.
sigDE <- names(pVals)[which(pVals < 0.05)]
DE_Quality_rate(sigDE)
## 0.789899 0.6171816 0.4065269
DE_Quality_AUC(pVals)
## [1] 0.8083324
MAST
# MAST: hurdle model on log2(counts + 1) with the cellular detection rate
# as a covariate.
log_counts <- log(counts + 1) / log(2)
# Feature (gene) and cell metadata, keyed by the row/column names.
fData <- data.frame(names = rownames(log_counts))
rownames(fData) <- rownames(log_counts)
cData <- data.frame(cond = group)
rownames(cData) <- colnames(log_counts)
obj <- FromMatrix(as.matrix(log_counts), cData, fData)
## `fData` has no primerid. I'll make something up.
## `cData` has no wellKey. I'll make something up.
## Assuming data assay in position 1, with name et is log-transformed.
# Scaled number of detected genes (non-zero entries) per cell.
colData(obj)$cngeneson <- scale(colSums(assay(obj) > 0))
cond <- factor(colData(obj)$cond)
# zlm() replaces the deprecated zlm.SingleCellAssay() with the same
# formula / object arguments.
zlmCond <- zlm(~ cond + cngeneson, obj)
## Warning in .nextMethod(object = object, value = value): Coefficients
## condNA19239 are never estimible and will be dropped.
##
## Done!
summaryCond <- summary(zlmCond, doLRT = "condNA19101")
## Combining coefficients and standard errors
## Calculating log-fold changes
## Calculating likelihood ratio tests
## Refitting on reduced model...
##
## Done!
summaryDt <- summaryCond$datatable
summaryDt <- as.data.frame(summaryDt)
# Rows of the combined hurdle component ("H").
# NOTE(review): column 4 is presumably the LRT p-value and column 1 the
# gene id - confirm against the installed MAST version.
is_hurdle <- summaryDt$component == "H"
pVals <- unlist(summaryDt[is_hurdle, 4]) # H = hurdle model
names(pVals) <- unlist(summaryDt[is_hurdle, 1])
pVals <- p.adjust(pVals, method = "fdr")
sigDE <- names(pVals)[pVals < 0.05]
DE_Quality_rate(sigDE)
## 0.82282 0.6507893 0.4952577
DE_Quality_AUC(pVals)
## [1] 0.8284046
BPSC
基于beta-poisson分布模型。
SCDE
针对单细胞的差异表达分析。
但是这两个方法非常耗时。