DESeq分析小鼠组织RNAseq数据

DESeq分析
环境: 一份Counts表格,含有各组样品名和基因名

根据指定的两组分析,并画火山图和热图

1. 读取xlsx文件

2. 提取行名,转为数值后再设行名

3. 创建样本分组信息,文件夹文件名

4. 构建对象,进行DESeq分析

5. 提取差异基因并保存txt,csv

6. 画火山图,可单独指定FCcutoff和pvaluecutoff,保存

7. 画热图,注意设定参数limitsize = FALSE,保存为PDF和PNG

Naf_in-vs-PBS_in-volcano.png

Naf_in-vs-PBS_in-heat.png
e86b7f433742e25969268f4949d1905.png

先建立一个函数文件:
Func.R


library(DESeq2) 
require(tidyverse)
require(ggplot2)
# require(ggVolcano)
require(pheatmap)
require(RColorBrewer)
require(EnhancedVolcano)
library(openxlsx)






DeseqAnalysis <- function(group1_index,group2_index){
  # 根据指定的两组画火山图
  # 1. 读取xlsx文件
  # 2. 提取行名,转为数值后再设行名
  # 3. 创建样本分组信息,文件夹文件名
  # 4. 构建对象,进行DESeq分析
  # 5. 提取差异基因并保存txt,csv
  # 6. 画火山图,可单独指定FCcutoff和pvaluecutoff,保存
  
  #Naf_in  PBS_in 为1和4
  
  # 加载 此前处理好的.xlsx 基因Counts文件
  df <- read.xlsx("MKX_mouse_result - gene-names2-20250512.xlsx", rowNames = TRUE)
  
  
  # 提取列名
  col_names <- colnames(df)
  
  # 提取行名
  row_names <- rownames(df)
  # row_names
  
  # 将数据框的主体部分转换为数值类型
  df_numeric <- as.data.frame(apply(df, 2, as.numeric))
  # df_numeric
  # colnames(df_numeric)
  # rownames(df_numeric)
  # # 重新设置列名
  # colnames(df_numeric) <- col_names
  
  # 重新设置行名,生成后续分析的counts_df
  rownames(df_numeric) <- row_names
  counts_df <-df_numeric
  
  
  # 创建样本分组信息
  sample_names <- colnames(counts_df)
  sample_names
  # counts_df
  
  
  
  
  #--------两两分析------------
  #c("Naf_in","Isa_ig","Ose_ig","PBS_in","PBS_ig","Mock")
  #Naf_in vs PBS_in
  # 1-4, 2-5, 3-5, 4-6, 5-6, 1-6, 2-6, 3-6
  
  #test
  # group1_index = 2
  # group2_index = 5
  
  
  # ---------------1-4---------
  # --------------------------------先分析 Naf_in 和 PBS_in组------------------------------
  # 第1组和第4组"Naf_in"  "PBS_in"
  index_group1 <- group1_index
  index_group2 <- group2_index
  
  # 样本重复数为3
  rep_num <- 3
  
  # 去除后缀,获取组样本名
  name_group1 = str_remove(colnames(df)[index_group1*rep_num],"_3$")
  name_group2 = str_remove(colnames(df)[index_group2*rep_num],"_3$")
  name_group1
  name_group2
  # 生成文件名
  diff_name = paste0(name_group1,"-vs-",name_group2)
  
  # 创建文件夹,两两分析
  dir.create(diff_name)
  # 创建RDS文件名
  f_name = paste0(diff_name,"/",diff_name,".RDS")
  f_name
  
  # 创建样本分组信息
  # sample_names <- colnames(counts_df)
  # sample_names
  # counts_df
  
  
  coldata <- data.frame(condition = factor(rep(c(name_group1,name_group2),each=3)), 
                        levels = c(rep(name_group1,3),rep(name_group2,3)))
  coldata
  # 选取包含 "name_group1" 或 "name_group2" 的列(并集)   Naf_in    PBS_in
  counts_df_vs <- counts_df %>% 
    select(contains(name_group1) | contains(name_group2))
  
  # 正式分析---------------DESeq------------------
  # 第一步,构建 DESeqDataSet 对象
  dds <- DESeqDataSetFromMatrix(countData = counts_df_vs, colData = coldata, design= ~condition)
  
  
  # ----------------- 关键过滤步骤 -----------------
  # 查看过滤效果
  cat("过滤前基因数:", nrow(counts(dds, normalized = FALSE)), "\n")
  before_gene_num <- nrow(counts(dds, normalized = FALSE))
  # 方法1:基于原始counts过滤
  # 保留至少在3个样本中count≥10的基因
  keep <- rowSums(counts(dds) >= 10) >= 3
  dds <- dds[keep, ]
  # dds
  cat("过滤后基因数:", nrow(dds), "\n")
  after_gene_num <- nrow(dds)
  # after_gene_num
  # rownames(dds)
  # rownames(dds1)
  # 
  # 第二步,计算差异倍数并获得 p 值
 
  # dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = T)
  dds1 <- DESeq(dds, fitType = 'mean', minReplicatesForReplace = 7, parallel = T)
  
  #注意,需将 treat 在前,control 在后,意为 treat 相较于 control 中哪些基因上调/下调
  res <- results(dds1, contrast = c('condition', name_group1, name_group2))
  res1 <- data.frame(res, stringsAsFactors = FALSE, check.names = FALSE)
  
  # # 重新设置行名(一定要在排序前)
  # rownames(res1) <- row_names
  # #排序
  # res1 <- res1[order(res1$padj, res1$log2FoldChange, decreasing = c(FALSE, TRUE)), ]#排序
  
  #其余标识 none,代表非差异的基因
  # res1[which(abs(res1$log2FoldChange) <= 1 | res1$padj >= 0.05),'sig'] <- 'none'
  # res1[which(res1$log2FoldChange >= 0.5 & res1$padj < 0.05),'sig'] <- 'up'
  # res1[which(res1$log2FoldChange <= -0.5 & res1$padj < 0.05),'sig'] <- 'down'
  # 设置参数cutoff
  FC_cutoff = 1.333
  P_cutoff = 0.01
  
  
  res1[which(abs(res1$log2FoldChange) <= FC_cutoff | res1$pvalue >= P_cutoff),'sig'] <- 'none'
  res1[which(res1$log2FoldChange >= FC_cutoff & res1$pvalue < P_cutoff),'sig'] <- 'up'
  res1[which(res1$log2FoldChange <= -FC_cutoff & res1$pvalue < P_cutoff),'sig'] <- 'down'
  
  #输出选择的差异基因总表
  res1_select <- subset(res1, sig %in% c('up', 'down'))
  #保存为txt--差异基因
  Deg_gene = paste0(diff_name,"/",diff_name,"Deg_gene.txt")
  write.table(res1_select, file = Deg_gene, sep = ';', col.names = NA, quote = FALSE)
  #保存为csv--所有基因
  TwoGroupFileName = paste0(diff_name,"/",diff_name,".csv")
  write.csv(res1, file = TwoGroupFileName, row.names = TRUE)
  
  # dds1 <- readRDS("./mock-beta_12h/mock-beta_12h.RDS")
  # isg <- read.csv2("./mock-beta_12h/mock-beta_12hDeg_gene.txt",sep=";")
  # isg %>%filter(str_detect(rownames(isg),"ISG") )
  
  
  diff_name
  title = paste0(diff_name,'(DESeq2 Results) ')
  title
  subtitle = paste0("Differential expression (Filtered Genes Num: ",after_gene_num,")")
  subtitle
  ##火山图, 里面确定pCut和FCcut
  p_evo <-  EnhancedVolcano(res1,
                            lab = rownames(res1),
                            x = "log2FoldChange",
                            # y = "padj",
                            y = "pvalue",
                            #selectLab = rownames(lrt)[1:4],
                            xlab = bquote(~Log[2]~ "Fold Change"),
                            ylab = bquote(~-Log[10] ~ italic(p-value)),
                            # pCutoff = 0.001,## pvalue
                            # FCcutoff = 1,## FC cutoff
                            pCutoff = 0.0001,## pvalue
                            FCcutoff = 1.333,## FC cutoff
                            # xlim = c(-5,5),
                            # ylim = c(0,7.5),
                            xlim = c(-6,6),
                            ylim = c(0,8),
                            pointSize = 3.5,
                            labSize =3.5,
                            title = title,
                            subtitle = subtitle,
                            caption = 'FC cutoff, 1.333; p-value cutoff, 10e-4',
                            # legendPosition = "right",
                            legendLabSize = 14,
                            # col = c('grey30', 'forestgreen', 'royalblue', 'red2'),
                            colAlpha = 0.8,
                            drawConnectors = TRUE,
                            lengthConnectors = unit(0.05, "inches"),
                            # hline = c(10e-8),
                            widthConnectors = 0.5,
                            max.overlaps = 50,
                            maxoverlapsConnectors=50
  )
  p_evo
  ggsave(paste0(diff_name,"/",diff_name,"-volcano.png"),p_evo,height = 10,width = 8)
  
  # 选择差异基因画热图
  dfheat <- filter(counts_df_vs,
            rownames(counts_df_vs) %in%
            rownames(res1_select)) %>%
            as.matrix()
  # dfheat
  p1 <- pheatmap(dfheat,  #要绘制热图的矩阵
                 color = colorRampPalette(c('#0080FF','white','#BF0060'))(100), #热图色块颜色是从蓝到红分为100个等级
                 border_color = "white",  #热图中每个色块的边框颜色,NA表示无边框
                 scale = "row", #按行进行归一化,"column"表示按列,"none"表示不进行归一化
                 cluster_rows = T, #是否对行进行聚类
                 cluster_cols = T, #是否对列进行聚类
                 legend = TRUE, #是否显示图例
                 legend_breaks = c(-1, 0, 1), #设置图例的断点
                 legend_labels = c("low","","heigh"), #设置图例断点处的标签
                 show_rownames = TRUE, #是否显示行名
                 show_colnames = TRUE, #是否显示列名
                 fontsize = 8, #字体大小,可以通过fontsize_row、fontsize_col参数分别设置行列名的字体大小

  )
  p1
  
  
  # # 或者使用像素单位
  # library(Cairo)
  # ggsave(paste0(diff_name, "/Volcano_plot.png"), 
  #        width = 1200, height = 1000, dpi = 300, device = CairoPNG)
  
  # 保存为标准 PDF
  # ggsave("plot.pdf", plot = p1, width = 12, height = 10, dpi = 300)
  # diff_name
  ggsave(paste0(diff_name,"/",diff_name,"-heat.pdf"),p1,height = length(res1_select$baseMean)*0.2,width = 10,limitsize = FALSE)
  ggsave(paste0(diff_name,"/",diff_name,"-heat.png"),p1,height = 12,width = 10,limitsize = FALSE)
  
  # if(length(res1_select$baseMean)>20){
  #   ggsave(paste0(diff_name,"/",diff_name,"-heat.pdf"),p1,height = length(res1_select$baseMean)*0.2,width = 10)
  #   ggsave(paste0(diff_name,"/",diff_name,"-heat.png"),p1,height = 12,width = 10)
  # }else{
  #   ggsave(paste0(diff_name,"/",diff_name,"-heat.png"),p1,height = 12,width = 10,dpi=300)
  # }
  
}

再创建调用函数文件:
use_RNAseq-20250512.R

source("Func.R")



#--------两两分析------------
#c("Naf_in","Isa_ig","Ose_ig","PBS_in","PBS_ig","Mock")
#Naf_in vs PBS_in
# 1-4, 2-5, 3-5, 4-6, 5-6, 1-6, 2-6, 3-6
DeseqAnalysis(1,4)
DeseqAnalysis(2,5)
DeseqAnalysis(3,5)
DeseqAnalysis(4,6)
DeseqAnalysis(5,6)
DeseqAnalysis(1,6)
DeseqAnalysis(2,6)
DeseqAnalysis(3,6)

counts表如下:


image.png
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容