setwd("D:\\R data\\RNA-seq")
count_data<-read.table("read.counts.txt",header = T,skip=1)
-
read.counts由STAR-fearureCounts流程得到,结构如下:
图片.png
sample_names<-paste("SRR35899",59:62,sep = "")
colnames(count_data)[7:10]<-sample_names
library(tidyverse)
count_data2<-count_data %>% separate(Geneid,into = c("gene_id","omit"),remove = T) %>% .[,-2]
count_data2<-count_data2[,-(2:6)]
rownames(count_data2)<-count_data2[,1]
count_data2<-count_data2[,-1]
-
数据整理后如下:
图片.png
group<-rep(c("Control","shAkap95"),2)
replicate<-c("Rep1","Rep1","Rep2","Rep2")
metadata<-data.frame(group,replicate,sample_names)
rownames(metadata)<-metadata[[3]]
metadata<-metadata[match(colnames(count_data2),metadata$sample_names),]
图片.png
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("DESeq2")
BiocManager::install("org.Mm.eg.db")
BiocManager::install("pheatmap")
BiocManager::install("RColorBrewer")
BiocManager::install("genefilter")
BiocManager::install("GO.db")
BiocManager::install("clusterProfiler")
BiocManager::install("biomaRt")
BiocManager::install("DOSE")
BiocManager::install("KEGG.db")
BiocManager::install("gage")
library(DESeq2)
# 要把数据变为整数
ddsMat<-DESeqDataSetFromMatrix(countData=count_data2,colData=metadata, design=~group)
ddsMat<-DESeq(ddsMat)
# get results from the testing with fdr ajust p value
results<-results(ddsMat,pAdjustMethod = "fdr",alpha = 0.05)
summary(results)
mcols(results,use.names = T)[,2]
sumary(results)
图片.png
mcols(mcols(results,use.names = T)[,2]
图片.png
### annotate gene symbols
library(org.Mm.eg.db)
results$Gene_symbol<-mapIds(x=org.Mm.eg.db,
keys = row.names(results), ##待转化IDs的列
column = "SYMBOL", ## 要转换为的ID类型
keytype = "ENSEMBL",## 待转化IDs的类型
multiVals = "first")
results$Gene_Name<-mapIds(x=org.Mm.eg.db,
keys = row.names(results), ##待转化IDs的列
column = "SYMBOL", ## 要转换为的ID类型
keytype = "ENSEMBL",## 待转化IDs的类型
multiVals = "first")
## subset for only significant genes (q<0.05)
results_sig<- subset(results,padj<0.05)
head(results_sig)
mapIds()函数与select()函数功能相似,但它可以对数据中出现的重复元素进行清理
mapIds与select函数在参数上的不同之处在于:
- columns 参数,只能选择一个column
- keytype 参数必须指定
- mapIds 包含multiVals函数用于清理重复元素
head(results_sig)
图片.png
#write normalized gene_counts to a .txt file
write.table(x=as.data.frame(counts(ddsMat),normalized=T),
file="normalized_counts.txt",
sep = "\t",
quote = F,
col.names = NA)
##write significant normalized gene_counts to a .txt file
write.table(x=counts(ddsMat[row.names(results_sig)],normalized=T),
file="normalized_counts_significant.txt",
sep = "\t",
quote = F,
col.names = NA)
##write annotated results table to a .txt file
write.table(x=as.data.frame(results),
file="results_gene_annotated.txt",
sep = "\t",
quote = F,
col.names = NA)
##write significant annotated results to a .txt file
write.table(x=as.data.frame(results_sig),
file="results_gene_annotated_significant.txt",
sep = "\t",
quote = F,
col.names = NA)
# plot PCA
library(ggplot2)
## convert all samoles to rlog
ddsMat_rlog<-rlog(ddsMat,blind = F)
## plot PCA by column variable
plotPCA(ddsMat_rlog,intgroup="group",ntop=500)+
theme_bw()+
geom_point(size=5)+
scale_y_continuous()+
ggtitle(label = "Principla component Analysis(PCA)",
subtitle = "Top 500 most variable genes")
图片.png
# heatmap
##convert all samoles to rlog
ddsMat_rlog<-rlog(ddsMat,blind=F)
## gather 40 significant genes to make matrix
mat<-assay(ddsMat_rlog[row.names(results_sig)])[1:40,]
rownames(mat)<-results_sig$Gene_symbol[1:40]
## Choose which column variable you want to annotate the coulumns by
annotation_col<-data.frame(
group=factor(colData(ddsMat_rlog)$group),
replicate=factor(colData(ddsMat_rlog)$replicate),
row.names = colData(ddsMat_rlog)$sample_names
)
##Specify the colors you want annotate the coulums by
ann_colors<-list(
group=c(Control="lightblue",shAkap95="darkorange"),
replicate=c(Rep1="darkred",Rep2="forestgreen")
)
library(pheatmap)
pheatmap(mat=mat,
color = colorRampPalette(brewer.pal(9,"YlOrBr"))(255),
scale="row", # scale genes to z-score (how many standard debiations)
annotation_colors = ann_colors,
annotation_col = annotation_col,
fontsize = 6.5,
cellwidth = 55,
show_colnames = F)
heatmap
图片.png
# volcano plot
data<-data.frame(gene=rownames(results),
pval=-log10(results$padj),
lfc=results$log2FoldChange)
data<-na.omit(data)
data$cat<-ifelse((data$lfc>1 & data$pval>2),"up",ifelse((data$lfc<(-1)&data$pval>2),"down","non_sig"))
gene_labels<- data %>% arrange(-pval,-(abs(lfc))) %>% .[,1] %>%.[1:20]
data<- arrange(data,-pval,-(abs(lfc)))
gene_labels<-mapIds(x=org.Mm.eg.db,
keys = gene_labels,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")
data$label<-c(label_gene_symbol,rep(NA,(nrow(data)-20)))
DEGS<-data
library(ggplot2)
library(ggrepel)
## 绘制渐变火山图
p<-ggplot(DEGS,aes(x=lfc,y=pval))+
geom_point(aes(size=pval,color=pval))+
scale_color_gradientn(values =seq(0,1,0.2),colors=c("#39489f","#39bbec","#f9ed36","#f38466","#b81f25"))+
scale_size_continuous(range = c(1,3))+
geom_hline(yintercept = -log10(0.01),linetype="dashed",color="#999999")+
geom_vline(xintercept = c(-1.2,1.2),linetype="dashed",color="#999999")+
theme_bw()+
theme(panel.grid = element_blank(), ##去除底纹
legend.position=c(0.01,0.7),
legend.justification =c(0,1)
)+
guides(col=guide_colorbar(title = '-Log_q_value'),
size="none")+
xlab("log2FC")+
ylab("-log10(FDR q-value")
p+ggrepel::geom_label_repel(aes(label=label,color=pval),DEGS,size=3,box.padding = unit(0.25,"lines")) ##或者用gerepel包添加
ggsave("volcano_plot.pdf",width = 10,height = 9)
图片.png
## 查看某个基因在中的表达
for (i in 1:10){
plotCounts(ddsMat,gene= data$gene[i],intgroup = "group",normalized = T,transform = T)
}
图片.png
#DEGs 富集分析
results_sig_gene_symbol<-subset(results_sig,is.na(Gene_symbol)==F) ##去除NA
gene_matrix<-results_sig_gene_symbol$log2FoldChange
names(gene_matrix)<-results_sig_gene_symbol$Gene_symbol
KEGG_gens_symbol<-names(gene_matrix)
KEGG_gene_entrez<-mapIds(x=org.Mm.eg.db,
keys = KEGG_gens_symbol,
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
multiVals = "first")
## enrichKEGG 中的基因必须是ENtrez ID,所以上面做了转换、
library(clusterProfiler)
Kegg_enrich<-enrichKEGG(gene=KEGG_gene_entrez,
organism = "mouse",
pvalueCutoff = 0.05,
qvalueCutoff = 0.1)
barplot(Kegg_enrich,drop=T,showCategory = 10,title = "KEGG pathay",font.size = 8,color="qvalue",)
图片.png
go_rich<-enrichGO(gene=KEGG_gene_entrez,
OrgDb = "org.Mm.eg.db",
readable = T,
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.1)
barplot(go_rich,
drop=T,
showCategory=10,
title="GO biological pathway",
font.size=8)
## plot KEGG
names(gene_matrix)<-KEGG_gene_entrez
BiocManager::install("pathview")
library(pathview)
pathview(gene.data = gene_matrix,pathway.id = "05165",species = "mouse") ##图篇直接保存在工作路径
mmu05165.pathview.png