DESeq分析
环境: 一份Counts表格,含有各组样品名和基因名
根据指定的两组分析,并画火山图和热图
1. 读取xlsx文件
2. 提取行名,转为数值后再设行名
3. 创建样本分组信息,文件夹文件名
4. 构建对象,进行DESeq分析
5. 提取差异基因并保存txt,csv
6. 画火山图,可单独指定FCcutoff和pvaluecutoff,保存
7. 画热图,注意设定参数limitsize = FALSE,保存为PDF和PNG

Naf_in-vs-PBS_in-volcano.png

Naf_in-vs-PBS_in-heat.png

e86b7f433742e25969268f4949d1905.png
先建立一个函数文件:
Func.R
library(DESeq2)
require(tidyverse)
require(ggplot2)
# require(ggVolcano)
require(pheatmap)
require(RColorBrewer)
require(EnhancedVolcano)
library(openxlsx)
DeseqAnalysis <- function(group1_index,group2_index){
# 根据指定的两组画火山图
# 1. 读取xlsx文件
# 2. 提取行名,转为数值后再设行名
# 3. 创建样本分组信息,文件夹文件名
# 4. 构建对象,进行DESeq分析
# 5. 提取差异基因并保存txt,csv
# 6. 画火山图,可单独指定FCcutoff和pvaluecutoff,保存
#Naf_in PBS_in 为1和4
# 加载 此前处理好的.xlsx 基因Counts文件
df <- read.xlsx("MKX_mouse_result - gene-names2-20250512.xlsx", rowNames = TRUE)
# 提取列名
col_names <- colnames(df)
# 提取行名
row_names <- rownames(df)
# row_names
# 将数据框的主体部分转换为数值类型
df_numeric <- as.data.frame(apply(df, 2, as.numeric))
# df_numeric
# colnames(df_numeric)
# rownames(df_numeric)
# # 重新设置列名
# colnames(df_numeric) <- col_names
# 重新设置行名,生成后续分析的counts_df
rownames(df_numeric) <- row_names
counts_df <-df_numeric
# 创建样本分组信息
sample_names <- colnames(counts_df)
sample_names
# counts_df
#--------两两分析------------
#c("Naf_in","Isa_ig","Ose_ig","PBS_in","PBS_ig","Mock")
#Naf_in vs PBS_in
# 1-4, 2-5, 3-5, 4-6, 5-6, 1-6, 2-6, 3-6
#test
# group1_index = 2
# group2_index = 5
# ---------------1-4---------
# --------------------------------先分析 Naf_in 和 PBS_in组------------------------------
# 第1组和第4组"Naf_in" "PBS_in"
index_group1 <- group1_index
index_group2 <- group2_index
# 样本重复数为3
rep_num <- 3
# 去除后缀,获取组样本名
name_group1 = str_remove(colnames(df)[index_group1*rep_num],"_3$")
name_group2 = str_remove(colnames(df)[index_group2*rep_num],"_3$")
name_group1
name_group2
# 生成文件名
diff_name = paste0(name_group1,"-vs-",name_group2)
# 创建文件夹,两两分析
dir.create(diff_name)
# 创建RDS文件名
f_name = paste0(diff_name,"/",diff_name,".RDS")
f_name
# 创建样本分组信息
# sample_names <- colnames(counts_df)
# sample_names
# counts_df
coldata <- data.frame(condition = factor(rep(c(name_group1,name_group2),each=3)),
levels = c(rep(name_group1,3),rep(name_group2,3)))
coldata
# 选取包含 "name_group1" 或 "name_group2" 的列(并集) Naf_in PBS_in
counts_df_vs <- counts_df %>%
select(contains(name_group1) | contains(name_group2))
# 正式分析---------------DESeq------------------
# 第一步,构建 DESeqDataSet 对象
dds <- DESeqDataSetFromMatrix(countData = counts_df_vs, colData = coldata, design= ~condition)
# ----------------- 关键过滤步骤 -----------------
# 查看过滤效果
cat("过滤前基因数:", nrow(counts(dds, normalized = FALSE)), "\n")
before_gene_num <- nrow(counts(dds, normalized = FALSE))
# 方法1:基于原始counts过滤
# 保留至少在3个样本中count≥10的基因
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep, ]
# dds
cat("过滤后基因数:", nrow(dds), "\n")
after_gene_num <- nrow(dds)
# after_gene_num
# rownames(dds)
# rownames(dds1)
#
# 第二步,计算差异倍数并获得 p 值
# dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = T)
dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = T)
#注意,需将 treat 在前,control 在后,意为 treat 相较于 control 中哪些基因上调/下调
res <- results(dds1, contrast = c('condition', name_group1, name_group2))
res1 <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)
# # 重新设置行名(一定要在排序前)
# rownames(res1) <- row_names
# #排序
# res1 <- res1[order(res1$padj, res1$log2FoldChange, decreasing = c(FALSE, TRUE)), ]#排序
#其余标识 none,代表非差异的基因
# res1[which(abs(res1$log2FoldChange) <= 1 | res1$padj >= 0.05),'sig'] <- 'none'
# res1[which(res1$log2FoldChange >= 0.5 & res1$padj < 0.05),'sig'] <- 'up'
# res1[which(res1$log2FoldChange <= -0.5 & res1$padj < 0.05),'sig'] <- 'down'
# 设置参数cutoff
FC_cutoff = 1.333
P_cutoff = 0.01
res1[which(abs(res1$log2FoldChange) <= FC_cutoff | res1$pvalue >= P_cutoff),'sig'] <- 'none'
res1[which(res1$log2FoldChange >= FC_cutoff & res1$pvalue < P_cutoff),'sig'] <- 'up'
res1[which(res1$log2FoldChange <= -FC_cutoff & res1$pvalue < P_cutoff),'sig'] <- 'down'
#输出选择的差异基因总表
res1_select <- subset(res1, sig %in% c('up', 'down'))
#保存为txt--差异基因
Deg_gene = paste0(diff_name,"/",diff_name,"Deg_gene.txt")
write.table(res1_select, file = Deg_gene, sep = ';', col.names = NA, quote = FALSE)
#保存为csv--所有基因
TwoGroupFileName = paste0(diff_name,"/",diff_name,".csv")
write.csv(res1, file = TwoGroupFileName, row.names = TRUE)
# dds1 <- readRDS("./mock-beta_12h/mock-beta_12h.RDS")
# isg <- read.csv2("./mock-beta_12h/mock-beta_12hDeg_gene.txt",sep=";")
# isg %>%filter(str_detect(rownames(isg),"ISG") )
diff_name
title = paste0(diff_name,'(DESeq2 Results) ')
title
subtitle = paste0("Differential expression (Filtered Genes Num: ",after_gene_num,")")
subtitle
##火山图, 里面确定pCut和FCcut
p_evo <- EnhancedVolcano(res1,
lab = rownames(res1),
x = "log2FoldChange",
# y = "padj",
y = "pvalue",
#selectLab = rownames(lrt)[1:4],
xlab = bquote(~Log[2]~ "Fold Change"),
ylab = bquote(~-Log[10] ~ italic(p-value)),
# pCutoff = 0.001,## pvalue
# FCcutoff = 1,## FC cutoff
pCutoff = 0.0001,## pvalue
FCcutoff = 1.333,## FC cutoff
# xlim = c(-5,5),
# ylim = c(0,7.5),
xlim = c(-6,6),
ylim = c(0,8),
pointSize = 3.5,
labSize =3.5,
title = title,
subtitle = subtitle,
caption = 'FC cutoff, 1.333; p-value cutoff, 10e-4',
# legendPosition = "right",
legendLabSize = 14,
# col = c('grey30', 'forestgreen', 'royalblue', 'red2'),
colAlpha = 0.8,
drawConnectors = TRUE,
lengthConnectors = unit(0.05, "inches"),
# hline = c(10e-8),
widthConnectors = 0.5,
max.overlaps = 50,
maxoverlapsConnectors=50
)
p_evo
ggsave(paste0(diff_name,"/",diff_name,"-volcano.png"),p_evo,height = 10,width = 8)
# 选择差异基因画热图
dfheat <- filter(counts_df_vs,
rownames(counts_df_vs) %in%
rownames(res1_select)) %>%
as.matrix()
# dfheat
p1 <- pheatmap(dfheat, #要绘制热图的矩阵
color = colorRampPalette(c('#0080FF','white','#BF0060'))(100), #热图色块颜色是从蓝到红分为100个等级
border_color = "white", #热图中每个色块的边框颜色,NA表示无边框
scale = "row", #按行进行归一化,"column"表示按列,"none"表示不进行归一化
cluster_rows = T, #是否对行进行聚类
cluster_cols = T, #是否对列进行聚类
legend = TRUE, #是否显示图例
legend_breaks = c(-1, 0, 1), #设置图例的断点
legend_labels = c("low","","heigh"), #设置图例断点处的标签
show_rownames = TRUE, #是否显示行名
show_colnames = TRUE, #是否显示列名
fontsize = 8, #字体大小,可以通过fontsize_row、fontsize_col参数分别设置行列名的字体大小
)
p1
# # 或者使用像素单位
# library(Cairo)
# ggsave(paste0(diff_name, "/Volcano_plot.png"),
# width = 1200, height = 1000, dpi = 300, device = CairoPNG)
# 保存为标准 PDF
# ggsave("plot.pdf", plot = p1, width = 12, height = 10, dpi = 300)
# diff_name
ggsave(paste0(diff_name,"/",diff_name,"-heat.pdf"),p1,height = length(res1_select$baseMean)*0.2,width = 10,limitsize = FALSE)
ggsave(paste0(diff_name,"/",diff_name,"-heat.png"),p1,height = 12,width = 10,limitsize = FALSE)
# if(length(res1_select$baseMean)>20){
# ggsave(paste0(diff_name,"/",diff_name,"-heat.pdf"),p1,height = length(res1_select$baseMean)*0.2,width = 10)
# ggsave(paste0(diff_name,"/",diff_name,"-heat.png"),p1,height = 12,width = 10)
# }else{
# ggsave(paste0(diff_name,"/",diff_name,"-heat.png"),p1,height = 12,width = 10,dpi=300)
# }
}
再创建调用函数文件:
use_RNAseq-20250512.R
source("Func.R")
#--------两两分析------------
#c("Naf_in","Isa_ig","Ose_ig","PBS_in","PBS_ig","Mock")
#Naf_in vs PBS_in
# 1-4, 2-5, 3-5, 4-6, 5-6, 1-6, 2-6, 3-6
DeseqAnalysis(1,4)
DeseqAnalysis(2,5)
DeseqAnalysis(3,5)
DeseqAnalysis(4,6)
DeseqAnalysis(5,6)
DeseqAnalysis(1,6)
DeseqAnalysis(2,6)
DeseqAnalysis(3,6)
counts表如下:

image.png