富集分析的“棒棒糖图”—R代码

今天无意间看到了一个图被惊艳到了,随后自己写代码,试了一下效果。做出来了,但有点细微区别,还请高人指点。


777639203090330380.jpg

R CODE如下



rm(list = ls())
options(stringsAsFactors = F)
# 加载必要包
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggplot2)
library(dplyr)
library(enrichplot)

# 读取差异表达数据
de_data <- read.csv("mRNADE.csv")

# 只保留显著差异的基因
sig_de_data <- de_data %>%
  filter(!change %in% c("Stable", "Not significant"))

# 基因ID转换
symbol_mapping <- bitr(sig_de_data$Name, 
                       fromType = "SYMBOL",
                       toType = "ENTREZID",
                       OrgDb = org.Hs.eg.db)

# 分组基因列表 - 只包含上调和下调基因
gene_list <- split(
  symbol_mapping$ENTREZID,
  sig_de_data$change[match(symbol_mapping$SYMBOL, sig_de_data$Name)]
)

# 并行KEGG富集分析
kegg_results <- compareCluster(
  geneClusters = gene_list,
  fun = "enrichKEGG",
  organism = "hsa",  # 人类KEGG代码
  pAdjustMethod = "BH",
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.2
)

# 确定上调和下调的列名
up_pattern <- "up|Up|UP|upregulated|Upregulated|UPREGULATED"
down_pattern <- "down|Down|DOWN|downregulated|Downregulated|DOWNREGULATED"

# 将结果转换为数据框
kegg_df <- as.data.frame(kegg_results)

# 分别为每个组别选择前10个通路
kegg_up <- kegg_df %>%
  filter(grepl(up_pattern, Cluster)) %>%
  top_n(10, -p.adjust)

kegg_down <- kegg_df %>%
  filter(grepl(down_pattern, Cluster)) %>%
  top_n(10, -p.adjust)

# 合并上调和下调的结果
kegg_top20 <- rbind(kegg_up, kegg_down)

# 计算RichFactor (GeneRatio的数值表示)
kegg_top20 <- kegg_top20 %>%
  mutate(
    GeneRatio_num = sapply(strsplit(GeneRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2])),
    # 根据实际的Cluster值判断上下调
    RichFactor = case_when(
      grepl(up_pattern, Cluster) ~ GeneRatio_num,
      grepl(down_pattern, Cluster) ~ -GeneRatio_num,
      TRUE ~ GeneRatio_num  # 默认情况
    )
  )

# 高级可视化 - KEGG富集图
custom_plot <- ggplot(kegg_top20, 
                      aes(x = RichFactor, 
                          y = reorder(Description, abs(RichFactor)),
                          size = Count,
                          color = -log10(p.adjust))) +
  geom_point(alpha = 0.8) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  scale_color_gradient(low = "#E6E6FA", high = "#4B0082",
                       name = "-log10(p.adj)") +
  scale_size_continuous(range = c(3, 8),
                        name = "Gene count") +
  # 添加分类颜色线条
  geom_segment(aes(x = 0, xend = RichFactor, 
                   y = reorder(Description, abs(RichFactor)), 
                   yend = reorder(Description, abs(RichFactor)),
                   color = NULL),
               size = 0.5, alpha = 0.6) +
  # 添加x轴标签
  scale_x_continuous(
    labels = function(x) abs(x),
    breaks = pretty(kegg_top20$RichFactor)
  ) +
  labs(x = "RichFactor", 
       y = "KEGG Pathways",
       title = "Top 10 KEGG Pathways for Up/Down Regulated Genes") +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid.major = element_line(color = "grey90"),
    axis.text.y = element_text(face = "bold", color = "black"),
    legend.position = "right",
    plot.title = element_text(hjust = 0.5, face = "bold")
  ) +
  # 添加下方的标签
  annotate("text", x = -max(abs(kegg_top20$RichFactor))/2, y = -1, label = "Down", size = 4) +
  annotate("text", x = max(abs(kegg_top20$RichFactor))/2, y = -1, label = "Up", size = 4)

custom_plot

# 输出高清图
ggsave("KEGG_analysis_richfactor.tiff", 
       plot = custom_plot,
       device = "tiff",
       dpi = 600,
       width = 14, 
       height = 10,
       compression = "lzw")

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容