前言
上一节,我们通过对组间整体质谱数据分析后,发现NC和PC、PT的组间差异显著,那么具体到单个蛋白质水平的结果又是如何呢。
在单个蛋白质水平上做比较分析,我们可以使用limma包的函数进行分析。除了组间比较外,它还可以对某些可能潜在影响比较结果的因素如性别、年龄等进行校正处理(可以先期对组间临床表型进行组间差异比较,查看哪些指标是组间显著差异的,这些指标可能影响蛋白质的组间差异比较。它们应该作为协变量被校正掉).
除了使用基于线性模型的limma包外,还可以使用常用的t-test等,但默认情况下,我们会认为case和control组的数据都服从同一正态分布,具有相同的方差,这个时候我们会使用student-t test。可实际上,case和control组的分布可能不是同一个方差,我们这个时候应该选择Welch's t-test。
导入数据
library(dplyr)
library(tibble)
library(ggplot2)
library(convert)
library(limma)
# rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")
ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
Differential Expression Analysis
如果数据存在配对情况,则可设置pair=T。
scale参数是针对counts数据设置的。
future.apply是对并行计算设置的。
eBayes的结果中logFC有时候让人很困惑,需要注意该结果是否的正确性,因此设置datCoe判断富集方向,也可以通过内置的pl图发现富集方向。
get_DiffProtein_limma <- function(dataset=ExprSet_LOG,
group_name=subgrp[1:2],
pair=FALSE,
scale=FALSE,
fc=1,
Pval=0.05){
# dataset=ExprSet_LOG
# group_name=subgrp[1:2]
# pair=FALSE
# scale=FALSE
# fc=1
# Pval=0.05
pheno <- pData(dataset) %>%
filter(SubGroup%in%group_name)
pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)
pheno$PID <- factor(as.character(pheno$PID))
if(pair){
# paired test
design <- model.matrix(~ pheno$SubGroup + pheno$PID)
rownames(design) <- rownames(pheno)
colnames(design) <- c("Intercept",
paste(group_col, collapse = "-"),
as.character(unique(pheno$PID)[-1]))
}else{
design <- model.matrix( ~ 0 + pheno$SubGroup)
rownames(design) <- rownames(pheno)
colnames(design) <- group_name
}
# show distribution
edata <- as.matrix(exprs(dataset))
exprSet <- edata[, colnames(edata)%in%rownames(pheno)]
boxplot(exprSet)
plotDensities(exprSet)
# Normalization: TMM
if(scale){
require(edgeR)
DGEList <- edgeR::DGEList(
counts = exprSet,
group = pheno$SubGroup)
exprSet_norm <- edgeR::calcNormFactors(DGEList, method = "TMM")
plotMDS(exprSet_norm, col=as.numeric(pheno$SubGroup))
}else{
exprSet_norm <- exprSet
}
# linear fitting
#limma_voom <- voom(exprSet_norm, design, plot = TRUE)
fit <- lmFit(exprSet_norm, design)
if(pair){
# eBayes
fit2 <- eBayes(fit)
qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1)
# differential features
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 2) %>%
rownames_to_column("GeneID")
# delta
require(future.apply)
plan(multiprocess, workers = 10)
delta_value <- future_apply(exprSet, 1, function(x, y){
# x = exprSet[1, ]
# y = pheno
dat <- data.frame(value=x, y) %>%
arrange(PID, SubGroup) %>%
dplyr::select(PID, SubGroup, value)
dat$SubGroup <- factor(dat$SubGroup, levels = group_name)
dat_delta <- dat %>% group_by(PID, SubGroup) %>%
summarise(mean_value=mean(value)) %>% # mean or median???
mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
ungroup()
delta <- mean(dat_delta$delta)
return(delta)
}, pheno) %>% data.frame() %>%
setNames("Delta") %>%
rownames_to_column("GeneID")
# stopCluster(cl)
# combine DEG and delta
diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")
}else{
# contrast group for unpaired test
group <- paste(group_name, collapse = "-")
if(group%in%"NC-PC"){
contrast <- makeContrasts(contrasts = "NC-PC",
levels = design)
}else if(group%in%"NC-PT"){
contrast <- makeContrasts(contrasts = "NC-PT",
levels = design)
}else if(group%in%"PC-PT"){
contrast <- makeContrasts(contrasts = "PC-PT",
levels = design)
}
print(contrast)
# eBayes
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)
qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1)
# differential features
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 1) %>%
rownames_to_column("GeneID")
# delta
require(future.apply)
plan(multiprocess, workers = 10)
delta_value <- future_apply(exprSet, 1, function(x, y){
# x = exprSet[1, ]
# y = pheno
dat <- data.frame(value=x, y) %>%
arrange(SubGroup) %>%
dplyr::select(SubGroup, value)
dat$Type <- factor(dat$SubGroup, levels = group_name)
dat_delta <- dat %>% group_by(SubGroup) %>%
summarise(mean_value=mean(value)) %>% # mean or median???
mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
ungroup()
delta <- mean(dat_delta$delta)
return(delta)
}, pheno) %>% data.frame() %>%
setNames("Delta") %>%
rownames_to_column("GeneID")
# stopCluster(cl)
# combine DEG and delta
diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")
}
# validate the enriched directory
pl <- data.frame(edata)[rownames(data.frame(edata))%in%diff_gene_delta$GeneID[1], , F] %>%
t() %>% data.frame() %>%
setNames("Gene") %>%
rownames_to_column("SampleID") %>%
inner_join(pheno%>%rownames_to_column("SampleID"), by = "SampleID") %>%
ggplot(aes(x=SubGroup, y=Gene))+
geom_boxplot()+
labs(y=diff_gene$GeneID[1], x="")+
ggpubr::stat_compare_means(method = "wilcox.test",
paired = pair,
comparisons = list(group_name))+
theme_bw()
print(pl)
# enriched directory: It is sometimes useful to check things by hand to make sure you have the right interpretation.
for(i in 1:5){
datCoe <- fit$coefficients[diff_gene_delta$GeneID[i], ]
deltaMean <- as.numeric(datCoe[group_name[2]] - datCoe[group_name[1]])
logFC <- diff_gene_delta[diff_gene_delta$GeneID%in%diff_gene_delta$GeneID[i], "logFC"]
cat(paste0(diff_gene_delta$GeneID[i], ": ", paste(rev(group_name), collapse = "-"), " = ", signif(deltaMean, 3)))
cat("\n")
cat(paste0(diff_gene_delta$GeneID[i], ": ", "logFC = ", signif(logFC, 3)))
cat("\n")
}
if((deltaMean > 0 & logFC > 0) | (deltaMean < 0 & logFC < 0)){
diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
}else if((deltaMean > 0 & logFC < 0) | (deltaMean < 0 & logFC > 0)){
diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
}
# Number & Block
dat_status <- table(pheno$SubGroup)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
diff_gene_delta$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
"vs",
paste(dat_status_number[2], dat_status_name[2], sep = "_"))
res <- diff_gene_delta %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
arrange(adj.P.Val, logFC)
print(dim(res %>% filter(Enrichment != "Nonsignif")))
return(res)
}
NC_PC_LOG2Impute <- get_DiffProtein_limma(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[1:2],
pair = FALSE,
scale = FALSE,
fc = 1,
Pval = 0.05)
write.csv(NC_PC_LOG2Impute, "NC_PC_limma_Mass_LOG2Impute.csv", row.names = F)
NC_PT_LOG2Impute <- get_DiffProtein_limma(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(1,3)],
pair = FALSE,
scale = FALSE,
fc = 1,
Pval = 0.05)
write.csv(NC_PT_LOG2Impute, "NC_PT_limma_Mass_LOG2Impute.csv", row.names = F)
PC_PT_LOG2Impute <- get_DiffProtein_limma(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(2, 3)],
pair = FALSE,
scale = FALSE,
fc = 1,
Pval = 0.05)
write.csv(PC_PT_LOG2Impute, "PC_PT_limma_Mass_LOG2Impute.csv", row.names = F)
Welch’s t-test
The independent samples t-test comes in two different forms:
the standard Student’s t-test, which assumes that the variance of the two groups are equal. the Welch's t-test, which is less restrictive compared to the original Student’s test. This is the test where you do not assume that the variance is the same in the two groups, which results in the fractional degrees of freedom.
Welch_ttest <- function(dataset=ExprSet_LOG2Impute,
group_name=subgrp[1:2],
Pval=0.05,
fc=1){
# dataset=ExprSet_LOG2Impute
# group_name=subgrp[1:2]
# Pval=0.05
# fc=1
pheno <- pData(dataset) %>%
filter(SubGroup%in%group_name) %>%
mutate(SubGroup=factor(SubGroup, levels = group_name))
edata <- exprs(dataset)[, rownames(pheno)]
require(rstatix)
Welch_res <- apply(edata, 1, function(x, y){
# x <- edata[1, ]
# y <- pheno$Group
dat <- data.frame(value=x, group=y)
mdn <- tapply(dat$value, dat$group, median) %>%
data.frame() %>% setNames("value") %>%
rownames_to_column("Group")
mdn1 <- with(mdn, mdn[Group%in%group_name[1], "value"])
mdn2 <- with(mdn, mdn[Group%in%group_name[2], "value"])
Log2FC <- log2(mdn2/mdn1)
rest <- t_test(data = dat, value ~ group)
return(c(Log2FC, rest$statistic, rest$p))
}, pheno$SubGroup) %>%
t() %>% data.frame() %>%
setNames(c("logFC", "rho", "P.value"))
res <- Welch_res[!is.nan(Welch_res$logFC), ] %>%
rownames_to_column("GeneID")
res$adj.P.Val <- p.adjust(as.numeric(res$P.value), method = "BH")
# Number & Block
dat_status <- table(pheno$SubGroup)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
res$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
"vs",
paste(dat_status_number[2], dat_status_name[2], sep = "_"))
# Enrichment
res[which(res$logFC >= fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
res[which(res$logFC <= -fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
res[which(abs(res$logFC) < fc | res$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
res <- res %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
arrange(adj.P.Val, logFC)
return(res)
}
NC_PC_LOG2Impute_WelchT <- Welch_ttest(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[1:2],
fc = 1,
Pval = 0.05)
write.csv(NC_PC_LOG2Impute_WelchT, "NC_PC_WelchT_Mass_LOG2Impute.csv", row.names = F)
NC_PT_LOG2Impute_WelchT <- Welch_ttest(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(1,3)],
fc = 1,
Pval = 0.05)
write.csv(NC_PT_LOG2Impute_WelchT, "NC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)
PC_PT_LOG2Impute_WelchT <- Welch_ttest(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(2,3)],
fc = 1,
Pval = 0.05)
write.csv(PC_PT_LOG2Impute_WelchT, "PC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)
systemic information
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)
Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] rstatix_0.7.0 convert_1.64.0 marray_1.68.0 limma_3.46.0 Biobase_2.50.0 BiocGenerics_0.36.0
[7] ggplot2_3.3.3 tibble_3.1.0 dplyr_1.0.5
loaded via a namespace (and not attached):
[1] sass_0.3.1 edgeR_3.32.1 tidyr_1.1.3 jsonlite_1.7.2 carData_3.0-4 bslib_0.2.4
[7] assertthat_0.2.1 askpass_1.1 cellranger_1.1.0 yaml_2.2.1 pillar_1.6.0 backports_1.2.1
[13] lattice_0.20-41 glue_1.4.2 reticulate_1.18 digest_0.6.27 ggsignif_0.6.0 colorspace_2.0-0
[19] cowplot_1.1.0 htmltools_0.5.1.1 Matrix_1.3-2 plyr_1.8.6 pkgconfig_2.0.3 broom_0.7.6
[25] haven_2.3.1 purrr_0.3.4 scales_1.1.1 RSpectra_0.16-0 openxlsx_4.2.3 rio_0.5.16
[31] openssl_1.4.3 generics_0.1.0 farver_2.1.0 car_3.0-10 ellipsis_0.3.1 ggpubr_0.4.0
[37] withr_2.4.1 umap_0.2.7.0 magrittr_2.0.1 crayon_1.4.1 readxl_1.3.1 evaluate_0.14
[43] fansi_0.4.2 forcats_0.5.0 foreign_0.8-81 tools_4.0.2 data.table_1.14.0 hms_1.0.0
[49] lifecycle_1.0.0 stringr_1.4.0 munsell_0.5.0 locfit_1.5-9.4 zip_2.1.1 jquerylib_0.1.3
[55] compiler_4.0.2 tinytex_0.31 rlang_0.4.10 grid_4.0.2 labeling_0.4.2 rmarkdown_2.7
[61] gtable_0.3.0 abind_1.4-5 DBI_1.1.1 curl_4.3 reshape2_1.4.4 R6_2.5.0
[67] knitr_1.31 utf8_1.2.1 stringi_1.4.6 Rcpp_1.0.6 vctrs_0.3.7 tidyselect_1.1.0
[73] xfun_0.20