DESeq2+clusterProfiler使用
读取featureCounts的reads统计结果,存储与countData矩阵中
library(ggplot2)
library(DESeq2)
library(RColorBrewer)
library(pheatmap)
require(AnnotationHub)
require(clusterProfiler)
data<-read.table("./read_counts.out", #读取featurecounts的计算结果
quote = "\t",skip = 1,header = TRUE)#如果第一行或者前几行不要的话,可以用skip参数
countData<-as.matrix(data[7:12]) # data的第7-12列为6个样品的reads 数信息,将其读入一个新的矩阵
sampleNames<-c("mut_1","mut_2","mut_3","WT_1","WT_2","WT_3") #将不同样品的名称读入一个数组,与featurecount结果一一对应
colnames(countData)=sampleNames #设置表达量矩阵的列名
rownames(countData)<-data$Geneid #设置表达量矩阵的行名
设置样品信息列表
#设置样品信息ColData
#样本信息colData,每一行对应一个样本,行名与countData的样本顺序一一对应,列为各种分组信息
# name condition
#mut_1 mut_1 mut
#mut_2 mut_2 mut
#mut_3 mut_3 mut
#WT_1 WT_1 WT
#WT_2 WT_2 WT
#WT_3 WT_3 WT
database<-data.frame(name=sampleNames, condition=factor(c("mut","mut","mut","WT","WT","WT")))
rownames(database)<-sampleNames
#创建一个DESeq2Dataset对象
dds<-DESeqDataSetFromMatrix(countData,colData = database,design = ~condition)
#差异表达分析
dds<-DESeq(dds)
#该步骤可以分步进行
# dds<- estimateSizeFactors(dds)
# dds <- estimateDispersions(dds)
# dds <- nbinomWaldTest(dds)
计算不同样品间的重复一致性
ddCor<-cor(countData, method="pearson")
pheatmap(ddCor)
ddCor_g<-pheatmap(ddCor,display_numbers = TRUE)
ggsave("Pearson_correlation_coefficient.png", ddCor_g,width = 16, height = 9, units = "in", dpi = 600)
计算不同样品间的距离
rld <- rlog(object=dds,blind=F)
sampleDist <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDist) #样品间距离的矩阵
rownames(sampleDistMatrix) <- paste0(rld$condition,"-", c(1,2,3))
colnames(sampleDistMatrix) <- rownames(sampleDistMatrix)
head(sampleDistMatrix) #样品间距离的数据
colors <- colorRampPalette(rev(brewer.pal(9,"Spectral")))(255)
Distance_among_samples<-pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDist,
clustering_distance_cols=sampleDist,
color = colors)
ggsave("Distance_among_samples.png",Distance_among_samples, width = 16, height = 9, units = "in", dpi = 600)
进行PCA分析
vsd <- vst(object=dds,blind=T)
mdsdata <- data.frame(cmdscale(sampleDistMatrix))
#cmdscale(classical multidimensional scaling)
mdsdata #返回MDS的坐标
mds <- cbind(mdsdata,as.data.frame(colData(vsd)))
mds #按列合并
g_PCA=ggplot(data=mds,aes(X1,X2,color=condition)) +geom_point(size=3)+xlab("PC1") +ylab("PC2")
ggsave("PCA.png",g_PCA, width = 16, height = 9, units = "in", dpi = 600)
获得差异表达分析结果
result<-results(dds,contrast = c("condition","mut","WT")) #比较两组,后面的是对照
summary(result)# 结果分析
DESeq 差异表达基因分析及绘图
DEG<-na.omit(result) #去除表达量中的NA值
data_DEG<-as.data.frame(DEG) #将表达量矩阵转化为数据框
data_DEG$change = as.factor(ifelse(data_DEG$padj < 0.01 & abs(data_DEG$log2FoldChange) > 1,ifelse(data_DEG$log2FoldChange > 1 ,'UP','DOWN'),'STABLE'))
this_tile <- paste0('Cutoff for log2FC is ',"1",
'\nThe number of up gene is ',nrow(data_DEG[data_DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[data_DEG$change =='DOWN',]))
g <- ggplot(data=data_DEG, aes(x=log2FoldChange, y=-log10(padj),color=change))+
geom_point(shape = 16, size=2)+
theme_set(theme_set(theme_bw(base_size=20))) +
xlab("log2 fold change") +
ylab("-log10 p-value") +theme(plot.title = element_text(size=15,hjust = 0.5)) +
theme_classic()+
scale_colour_manual(values = c('green','grey','red'))+ggtitle(this_tile)+geom_vline(xintercept = c(-1, 1), lty = 3, color = 'black') + #添加阈值线
geom_hline(yintercept = 2, lty = 3, color = 'black')
#xlim(-12, 12) #+ ylim(0, 35) #定义刻度边界
#保存火山图为pdf文件
ggsave("Volcano_Plots.pdf",g,
useDingbats = FALSE, width = 16, height = 9, units = "in", dpi = 600)
ggsave("Volcano_Plots.png",g,width = 16, height = 9, units = "in", dpi = 600)
#也可用其他函数
#pdf(“Volcano_Plots.pdf", bg = "white")
#g
#dev.off()
#保存火山图为png格式图片
#png(“Volcano_Plots.png", bg = "white")
#g
#dev.off()
输出上调及下调表达基因
#设置差异表达的阈值为,padj<0.01,上调或下调2倍以上
up_diff_gene_deseq2 <-subset(data_DEG, padj < 0.01 & log2FoldChange > 1)
write.csv(up_diff_gene_deseq2,file= "up_DESeq2_diffExpression.csv")
down_diff_gene_deseq2 <-subset(data_DEG, padj < 0.01 & log2FoldChange < -1)
write.csv(down_diff_gene_deseq2,file= "down_DESeq2_diffExpression.csv")
gene_deseq2 <-subset(data_DEG, padj < 0.01 & abs(log2FoldChange) > 1)
write.csv(gene_deseq2,file= "All_DESeq2_diffExpression.csv")
GO及KEGG分析
hub<-AnnotationHub::AnnotationHub()
query(hub, "zea")
maize<-hub[["AH93894"]]
# 将基因编号转化为ENTREZID,v3版本,其余版本需要自己进行blast2GO,构建数据库信息
data_gene_for_GO<-bitr(rownames(gene_deseq2),fromType = "ALIAS",toType = "ENTREZID",OrgDb = maize)
#进行Go enrichment 分析
#生物过程(Biological Process)
erich.go.BP <- enrichGO(gene=data_gene_for_GO$ENTREZID,OrgDb = maize,pvalueCutoff = 0.01,qvalueCutoff = 0.01,readable = T,ont = "BP")
g_BP<-barplot(erich.go.BP,showCategory = 15)
ggsave("GO_BP.png",g_BP, width = 16, height = 9, units = "in", dpi = 600)
# 分子功能(Molecular Function)
erich.go.mf <- enrichGO(gene=data_gene_for_GO$ENTREZID,OrgDb = maize,pvalueCutoff = 0.01,qvalueCutoff = 0.01,readable = T,ont = "MF")
g_MF<-barplot(erich.go.mf,showCategory = 15)
ggsave("GO_MF.png",g_MF, width = 16, height = 9, units = "in", dpi = 600)
# 细胞组成(Cellular Component)
erich.go.cc <- enrichGO(gene=data_gene_for_GO$ENTREZID,OrgDb = maize,pvalueCutoff = 0.01,qvalueCutoff = 0.01,readable = T,ont = "CC")
g_CC<-barplot(erich.go.cc,showCategory = 15)
ggsave("GO_CC.png",g_CC, width = 16, height = 9, units = "in", dpi = 600)
# 进行KEGG分析
ekk<-enrichKEGG(gene = data_gene_for_GO$ENTREZID, organism = "zma", pAdjustMethod = "BH", pvalueCutoff = 0.01)
g_ekk<-dotplot(ekk)
ggsave("KEGG.png", g_ekk,width = 16, height = 9, units = "in", dpi = 600)