【RNA-Seq 实战】六、DEG分析

首先要获取表达矩阵数据:


image.png

然后就导入R开始分析吧!

1 差异分析

airway、DESeq2、edgeR

library(DESeq2)
library(edgeR)
library(limma)
library(airway)

参考代码在这里:


image.png

DESeq_DEG:

group_list <- c('untrt','trt','untrt','trt','untrt','trt','untrt','trt')

suppressMessages(library(DESeq2)) 
(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = exprSet,
                              colData = colData,
                              design = ~ group_list)
dds <- DESeq(dds)

res <- results(dds, 
               contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG=as.data.frame(resOrdered)

##去除na
DEG <- na.omit(DEG)
nrDEG=DEG
DEseq_DEG=nrDEG

绘制热图:

## heatmap
library(pheatmap)
choose_gene=head(rownames(DEG),100) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)
image.png

绘制火山图:

logFC_cutoff <- with(DEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )

# logFC_cutoff=1
DEG$change = as.factor(ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff,
                                   ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)

library(ggplot2)
g = ggplot(data=DEG, 
           aes(x=log2FoldChange, y=-log10(pvalue), 
               color=change)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 p-value") +
  ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g,filename = paste0(n,'_volcano.png'))
}
image.png

DEG数据表:


image.png

后续参考代码:


image.png
plotDispEsts(dds, main="Dispersion plot")
image.png
png("DESeq2_RAWvsNORM.png",height = 800,width = 800)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(exprSet, col = cols,main="expression value",las=2)
boxplot(exprMatrix_rlog, col = cols,main="expression value",las=2)
hist(as.matrix(exprSet))
hist(exprMatrix_rlog)
image.png

edgeR_DEG:

library(edgeR)
d <- DGEList(counts=exprSet,group=factor(group_list))
d$samples$lib.size <- colSums(d$counts)
d <- calcNormFactors(d)
d$samples

> d$samples
           group lib.size norm.factors
SRR1039508 untrt 20637971    1.0553567
SRR1039509   trt 18809481    1.0291947
SRR1039512 untrt 25348649    0.9832690
SRR1039513   trt 15163415    0.9490081
SRR1039516 untrt 24448408    1.0255871
SRR1039517   trt 30818215    0.9729022
SRR1039520 untrt 19126151    1.0308785
SRR1039521   trt 21164133    0.9592055

dge=d
design <- model.matrix(~0+factor(group_list))
rownames(design)<-colnames(dge)
colnames(design)<-levels(factor(group_list))

dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
lrt <- glmLRT(fit,  contrast=c(-1,1)) 
nrDEG=topTags(lrt, n=nrow(dge))
nrDEG=as.data.frame(nrDEG)
head(nrDEG)

> head(nrDEG)
                    logFC   logCPM       LR       PValue          FDR
ENSG00000152583 -4.584636 5.536759 380.3237 1.058022e-84 6.782133e-80
ENSG00000109906 -7.138645 4.164218 246.8435 1.266462e-55 4.059139e-51
ENSG00000179094 -3.167552 5.177666 203.9023 2.939683e-46 6.281319e-42
ENSG00000189221 -3.288682 6.769370 186.6373 1.723330e-42 2.761722e-38
ENSG00000101347 -3.842554 9.207551 185.3601 3.274735e-42 4.198341e-38
ENSG00000096060 -3.921181 6.899072 173.0156 1.623902e-39 1.734923e-35

edgeR_DEG =nrDEG 

DEG_limma_voom:

suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design

dge <- DGEList(counts=exprSet)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log=TRUE, prior.count=3)

v <- voom(dge,design,plot=TRUE, normalize="quantile")
fit <- lmFit(v, design)##运行后画了下图

group_list
cont.matrix=makeContrasts(contrasts=c('untrt-trt'),levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)

tempOutput = topTable(fit2, coef='untrt-trt', n=Inf)
DEG_limma_voom = na.omit(tempOutput)
image.png

2 分析结果间比较

image.png
a1 <- data.frame(gene=rownames(DEG_limma_voom),
                 limma=DEG_limma_voom$logFC
)

a2 <- data.frame(gene=rownames(DEseq_DEG),
                 limma=DEseq_DEG$log2FoldChange
)

a3 <- data.frame(gene=rownames(edgeR_DEG),
                  limma=edgeR_DEG$logFC
)

然后把a1、a2、a3的数据进行粗略比较,用plot看一看是否呈线性,即不同方法得到的分析结果,是否对同一基因有明显差异。

比如,对limma和DEseq结果比较

tmp=merge(a1,a2,by='gene')
plot(tmp[,2:3])
image.png

结果重合度还是较高的。

至此,生信技能树的RNA-Seq教程结束。

当然,还有很多内容没有涉及,比如富集分析的实战。

我会参考简书的教程继续学习。

那么,期待下一个专题学习相见!

这里是佳奥,我们下一篇再见!

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容