2021-12-14 DESeq差异分析以及后续可视化

setwd("D:\\R data\\RNA-seq")
count_data<-read.table("read.counts.txt",header = T,skip=1)

  • read.counts由STAR-fearureCounts流程得到,结构如下:


    图片.png
sample_names<-paste("SRR35899",59:62,sep = "")
colnames(count_data)[7:10]<-sample_names
library(tidyverse)
count_data2<-count_data %>% separate(Geneid,into = c("gene_id","omit"),remove = T) %>% .[,-2]
count_data2<-count_data2[,-(2:6)]  
rownames(count_data2)<-count_data2[,1]
count_data2<-count_data2[,-1]
  • 数据整理后如下:


    图片.png
group<-rep(c("Control","shAkap95"),2)
replicate<-c("Rep1","Rep1","Rep2","Rep2")
metadata<-data.frame(group,replicate,sample_names)
rownames(metadata)<-metadata[[3]]
metadata<-metadata[match(colnames(count_data2),metadata$sample_names),]

图片.png
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("DESeq2")
BiocManager::install("org.Mm.eg.db")
BiocManager::install("pheatmap")
BiocManager::install("RColorBrewer")
BiocManager::install("genefilter")
BiocManager::install("GO.db")
BiocManager::install("clusterProfiler")
BiocManager::install("biomaRt")
BiocManager::install("DOSE")
BiocManager::install("KEGG.db")
BiocManager::install("gage")

library(DESeq2)
# 要把数据变为整数
ddsMat<-DESeqDataSetFromMatrix(countData=count_data2,colData=metadata, design=~group)
ddsMat<-DESeq(ddsMat)
# get results from the testing with fdr ajust p value
results<-results(ddsMat,pAdjustMethod = "fdr",alpha = 0.05)
summary(results)
mcols(results,use.names = T)[,2]

sumary(results)


图片.png

mcols(mcols(results,use.names = T)[,2]


图片.png
### annotate gene symbols
library(org.Mm.eg.db)
results$Gene_symbol<-mapIds(x=org.Mm.eg.db,
                            keys = row.names(results),  ##待转化IDs的列
                            column = "SYMBOL", ## 要转换为的ID类型
                            keytype = "ENSEMBL",## 待转化IDs的类型
                            multiVals = "first")
results$Gene_Name<-mapIds(x=org.Mm.eg.db,
                            keys = row.names(results),  ##待转化IDs的列
                            column = "SYMBOL", ## 要转换为的ID类型
                            keytype = "ENSEMBL",## 待转化IDs的类型
                            multiVals = "first")

## subset for only significant genes (q<0.05)
results_sig<- subset(results,padj<0.05)
head(results_sig)

mapIds()函数与select()函数功能相似,但它可以对数据中出现的重复元素进行清理
mapIds与select函数在参数上的不同之处在于:

  • columns 参数,只能选择一个column
  • keytype 参数必须指定
  • mapIds 包含multiVals函数用于清理重复元素

head(results_sig)


图片.png
#write normalized gene_counts to a .txt file
write.table(x=as.data.frame(counts(ddsMat),normalized=T),
            file="normalized_counts.txt",
            sep = "\t",
            quote = F,
            col.names = NA)
##write significant normalized gene_counts to a .txt file
write.table(x=counts(ddsMat[row.names(results_sig)],normalized=T),
            file="normalized_counts_significant.txt",
            sep = "\t",
            quote = F,
            col.names = NA)
##write annotated results table to a .txt file
write.table(x=as.data.frame(results),
                  file="results_gene_annotated.txt",
                  sep = "\t",
                  quote = F,
                  col.names = NA)
##write significant annotated results to a .txt file
write.table(x=as.data.frame(results_sig),
            file="results_gene_annotated_significant.txt",
            sep = "\t",
            quote = F,
            col.names = NA)
# plot PCA
library(ggplot2)
  ## convert all samoles to rlog
ddsMat_rlog<-rlog(ddsMat,blind = F)
  ## plot PCA by column variable
plotPCA(ddsMat_rlog,intgroup="group",ntop=500)+
  theme_bw()+
  geom_point(size=5)+ 
  scale_y_continuous()+
  ggtitle(label = "Principla component Analysis(PCA)",
          subtitle = "Top 500 most variable genes")
图片.png
# heatmap
  ##convert all samoles to rlog
ddsMat_rlog<-rlog(ddsMat,blind=F)
  ## gather 40 significant genes to make matrix
mat<-assay(ddsMat_rlog[row.names(results_sig)])[1:40,]
rownames(mat)<-results_sig$Gene_symbol[1:40]
 ## Choose which column variable you want to annotate the coulumns by
annotation_col<-data.frame(
  group=factor(colData(ddsMat_rlog)$group),
  replicate=factor(colData(ddsMat_rlog)$replicate),
  row.names = colData(ddsMat_rlog)$sample_names
)

  ##Specify the colors you want annotate the coulums by
ann_colors<-list(
  group=c(Control="lightblue",shAkap95="darkorange"),
  replicate=c(Rep1="darkred",Rep2="forestgreen")
)
library(pheatmap)
pheatmap(mat=mat,
         color = colorRampPalette(brewer.pal(9,"YlOrBr"))(255),
         scale="row", # scale genes to z-score (how many standard debiations)
         annotation_colors = ann_colors,
         annotation_col = annotation_col,
         fontsize = 6.5,
         cellwidth = 55,
         show_colnames = F)

heatmap


图片.png
# volcano plot
data<-data.frame(gene=rownames(results),
                 pval=-log10(results$padj),
                 lfc=results$log2FoldChange)
data<-na.omit(data)

data$cat<-ifelse((data$lfc>1 & data$pval>2),"up",ifelse((data$lfc<(-1)&data$pval>2),"down","non_sig"))

gene_labels<- data %>% arrange(-pval,-(abs(lfc))) %>% .[,1] %>%.[1:20]
data<- arrange(data,-pval,-(abs(lfc)))


gene_labels<-mapIds(x=org.Mm.eg.db,
         keys = gene_labels,
         column = "SYMBOL",
         keytype = "ENSEMBL",
         multiVals = "first")
data$label<-c(label_gene_symbol,rep(NA,(nrow(data)-20)))

DEGS<-data
library(ggplot2)
library(ggrepel)
## 绘制渐变火山图
p<-ggplot(DEGS,aes(x=lfc,y=pval))+
  geom_point(aes(size=pval,color=pval))+
  scale_color_gradientn(values =seq(0,1,0.2),colors=c("#39489f","#39bbec","#f9ed36","#f38466","#b81f25"))+
  scale_size_continuous(range = c(1,3))+
  geom_hline(yintercept = -log10(0.01),linetype="dashed",color="#999999")+
  geom_vline(xintercept = c(-1.2,1.2),linetype="dashed",color="#999999")+
  theme_bw()+
  theme(panel.grid = element_blank(), ##去除底纹
        legend.position=c(0.01,0.7),
        legend.justification =c(0,1) 
        )+
  guides(col=guide_colorbar(title = '-Log_q_value'),
         size="none")+
  xlab("log2FC")+
  ylab("-log10(FDR q-value")
p+ggrepel::geom_label_repel(aes(label=label,color=pval),DEGS,size=3,box.padding = unit(0.25,"lines")) ##或者用gerepel包添加
ggsave("volcano_plot.pdf",width = 10,height = 9)  

图片.png
## 查看某个基因在中的表达
for (i in 1:10){
plotCounts(ddsMat,gene= data$gene[i],intgroup = "group",normalized = T,transform = T)
}
图片.png
#DEGs 富集分析
results_sig_gene_symbol<-subset(results_sig,is.na(Gene_symbol)==F) ##去除NA
gene_matrix<-results_sig_gene_symbol$log2FoldChange
names(gene_matrix)<-results_sig_gene_symbol$Gene_symbol
KEGG_gens_symbol<-names(gene_matrix)
KEGG_gene_entrez<-mapIds(x=org.Mm.eg.db,
                    keys = KEGG_gens_symbol,
                    column = "ENTREZID",
                    keytype = "SYMBOL",
                    multiVals = "first")
                      multiVals = "first")
  ## enrichKEGG 中的基因必须是ENtrez ID,所以上面做了转换、
library(clusterProfiler)
Kegg_enrich<-enrichKEGG(gene=KEGG_gene_entrez,
                        organism = "mouse",
                        pvalueCutoff = 0.05,
                        qvalueCutoff = 0.1)
barplot(Kegg_enrich,drop=T,showCategory = 10,title = "KEGG pathay",font.size = 8,color="qvalue",)


图片.png
go_rich<-enrichGO(gene=KEGG_gene_entrez,
                  OrgDb = "org.Mm.eg.db",
                  readable = T,
                  ont = "BP",
                 pvalueCutoff = 0.05,
                 qvalueCutoff = 0.1)
barplot(go_rich,
        drop=T,
        showCategory=10,
        title="GO biological pathway",
        font.size=8)
 ##  plot KEGG
names(gene_matrix)<-KEGG_gene_entrez
BiocManager::install("pathview")
library(pathview)
pathview(gene.data = gene_matrix,pathway.id = "05165",species = "mouse") ##图篇直接保存在工作路径

mmu05165.pathview.png
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容