首先要获取表达矩阵数据:
image.png
然后就导入R开始分析吧!
1 差异分析
airway、DESeq2、edgeR
library(DESeq2)
library(edgeR)
library(limma)
library(airway)
参考代码在这里:
image.png
DESeq_DEG:
group_list <- c('untrt','trt','untrt','trt','untrt','trt','untrt','trt')
suppressMessages(library(DESeq2))
(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = exprSet,
colData = colData,
design = ~ group_list)
dds <- DESeq(dds)
res <- results(dds,
contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG=as.data.frame(resOrdered)
##去除na
DEG <- na.omit(DEG)
nrDEG=DEG
DEseq_DEG=nrDEG
绘制热图:
## heatmap
library(pheatmap)
choose_gene=head(rownames(DEG),100) ## 50 maybe better
choose_matrix=exprSet[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)
image.png
绘制火山图:
logFC_cutoff <- with(DEG,mean(abs( log2FoldChange)) + 2*sd(abs( log2FoldChange)) )
# logFC_cutoff=1
DEG$change = as.factor(ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff,
ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'NOT')
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
library(ggplot2)
g = ggplot(data=DEG,
aes(x=log2FoldChange, y=-log10(pvalue),
color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
ggsave(g,filename = paste0(n,'_volcano.png'))
}
image.png
DEG数据表:
image.png
后续参考代码:
image.png
plotDispEsts(dds, main="Dispersion plot")
image.png
png("DESeq2_RAWvsNORM.png",height = 800,width = 800)
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(exprSet, col = cols,main="expression value",las=2)
boxplot(exprMatrix_rlog, col = cols,main="expression value",las=2)
hist(as.matrix(exprSet))
hist(exprMatrix_rlog)
image.png
edgeR_DEG:
library(edgeR)
d <- DGEList(counts=exprSet,group=factor(group_list))
d$samples$lib.size <- colSums(d$counts)
d <- calcNormFactors(d)
d$samples
> d$samples
group lib.size norm.factors
SRR1039508 untrt 20637971 1.0553567
SRR1039509 trt 18809481 1.0291947
SRR1039512 untrt 25348649 0.9832690
SRR1039513 trt 15163415 0.9490081
SRR1039516 untrt 24448408 1.0255871
SRR1039517 trt 30818215 0.9729022
SRR1039520 untrt 19126151 1.0308785
SRR1039521 trt 21164133 0.9592055
dge=d
design <- model.matrix(~0+factor(group_list))
rownames(design)<-colnames(dge)
colnames(design)<-levels(factor(group_list))
dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
lrt <- glmLRT(fit, contrast=c(-1,1))
nrDEG=topTags(lrt, n=nrow(dge))
nrDEG=as.data.frame(nrDEG)
head(nrDEG)
> head(nrDEG)
logFC logCPM LR PValue FDR
ENSG00000152583 -4.584636 5.536759 380.3237 1.058022e-84 6.782133e-80
ENSG00000109906 -7.138645 4.164218 246.8435 1.266462e-55 4.059139e-51
ENSG00000179094 -3.167552 5.177666 203.9023 2.939683e-46 6.281319e-42
ENSG00000189221 -3.288682 6.769370 186.6373 1.723330e-42 2.761722e-38
ENSG00000101347 -3.842554 9.207551 185.3601 3.274735e-42 4.198341e-38
ENSG00000096060 -3.921181 6.899072 173.0156 1.623902e-39 1.734923e-35
edgeR_DEG =nrDEG
DEG_limma_voom:
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
design
dge <- DGEList(counts=exprSet)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log=TRUE, prior.count=3)
v <- voom(dge,design,plot=TRUE, normalize="quantile")
fit <- lmFit(v, design)##运行后画了下图
group_list
cont.matrix=makeContrasts(contrasts=c('untrt-trt'),levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
tempOutput = topTable(fit2, coef='untrt-trt', n=Inf)
DEG_limma_voom = na.omit(tempOutput)
image.png
2 分析结果间比较
image.png
a1 <- data.frame(gene=rownames(DEG_limma_voom),
limma=DEG_limma_voom$logFC
)
a2 <- data.frame(gene=rownames(DEseq_DEG),
limma=DEseq_DEG$log2FoldChange
)
a3 <- data.frame(gene=rownames(edgeR_DEG),
limma=edgeR_DEG$logFC
)
然后把a1、a2、a3的数据进行粗略比较,用plot看一看是否呈线性,即不同方法得到的分析结果,是否对同一基因有明显差异。
比如,对limma和DEseq结果比较
tmp=merge(a1,a2,by='gene')
plot(tmp[,2:3])
image.png
结果重合度还是较高的。
至此,生信技能树的RNA-Seq教程结束。
当然,还有很多内容没有涉及,比如富集分析的实战。
我会参考简书的教程继续学习。
那么,期待下一个专题学习相见!
这里是佳奥,我们下一篇再见!