方法选择
做差异分析需要的数据:表达矩阵和分组信息
当表达矩阵为counts时,优先使用edgeR;当只有TPM时,TPM的分布接近芯片数据,使用limma;FPKM做差异分析,不推荐!
limma
使用limma对TPM表达矩阵进行差异分析并绘制火山图:
TPM <- readRDS('./data/TPM.RDS')
meta_data <- readRDS('./data/meta_data.RDS')
exp <- TPM
exp <- exp[apply(exp,1,mean)>0,]
#impute missing expression data
mat <- impute.knn(as.matrix(exp))
rt <- mat$data
rt <- avereps(rt)
qx <- as.numeric(quantile(rt, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) || (qx[6]-qx[1] > 50 && qx[2] > 0)
if (LogC) { rt[which(rt <= 0)] <- NaN
rt <- log2(rt) }
feature <- 'qixuxinlie2'
stage=1
test_idx <- which(meta_data[[feature]] == stage)
control_idx <- which(meta_data[[feature]] != stage)
test_num <- length(test_idx) #实验组样本数量
control_num <- length(control_idx) #对照组样本数量
class <- c(rep("dis",test_num),rep("con",control_num))
design <- model.matrix(~0+factor(class))
colnames(design) <- c("con","dis")
fit <- lmFit(rt[,c(test_idx,control_idx)],design)
cont.matrix<-makeContrasts(dis-con,levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
allDiff <- topTable(fit2,adjust='BH',number=200000)
#volcano
allDiff$threshold <- 'non'
allDiff$threshold[allDiff$adj.P.Val < 0.05 & allDiff$logFC > 1 ] = "up"
allDiff$threshold[allDiff$adj.P.Val < 0.05 & allDiff$logFC < -1 ] = "down"
allDiff$threshold <- factor(allDiff$threshold,levels = c('up','down','non'))
allDiff$gene <- rownames(allDiff)
allDiff$gene[is.na(allDiff$threshold)] <- ''
diffSig <- allDiff[which(abs(allDiff$logFC) > 1 & allDiff$adj.P.Val < 0.05), ]
ggplot(allDiff,aes(x=logFC,y=-log10(adj.P.Val),colour = threshold))+
xlab("log2 Fold Change")+
ylab("-log10Pvalue") +
geom_point(size=1,alpha=0.6)+
# geom_text(aes(label = gene))+
scale_color_manual(breaks = c('up','down','non'),values = c("#BC3C28","#0072B5","grey")) +
geom_hline(aes(yintercept=-log10(0.05)),colour="grey",size=1 ,linetype=2) + #增加水平间隔线
geom_vline(aes(xintercept=0), colour="grey",size=1 ,linetype=2) + #增加垂直间隔线
theme_few() +
theme(legend.title = element_blank()) + #去掉网格背景和图注标签
ggtitle(paste0('Volcano Plot of ',feature,'=',stage,'\n',
length(which(allDiff == 'up')),' genes up','\n',
length(which(allDiff == 'down')),' genes down'))