启动子元件分析后数据可视化

对基因启动子元件分析,可以用plantCARE进行分析,结果文件直接发送邮件:
plantCARE链接:
http://bioinformatics.psb.ugent.be/webtools/plantcare/html/

PlantCARE分析启动子元件

1. R包的安装及工作路径设置

library(devtools)
install_github("jokergoo/ComplexHeatmap") #ComplexHeatmap包的加载直接从github下载

library(tidyverse)
library(cowplot)
library(openxlsx)
library(pheatmap)
library(ggsci)
library(data.table)
library(extrafont)
library(ggplot2)
library(ComplexHeatmap)
library(viridis)

setwd("工作路径") #按照自己工作路径设置

2. 读取读取和整理文件

pheatmap运行主要需要两类文件:
一个为启动子于PlantCARE里的结果文件(需要自行替换行标题——对应至基因)
一个是基于PlantCARE结果对调控元件进行分类(各启动子上存在的调控元件存在差异,需自行汇总)
文件读取如下:

#2.1读取顺式作用元件的分类文件
#顺式作用元件分类文件是基于启动子序列在plantcare上分析得到结果进行分类
#主要分为三大类:Abiotic and biotic stresses,Plant growth and development,Phytohormone responsive

cis_annotation <- read.xlsx('./cis_promoter_category.xlsx') %>%
  select(cis_element, category)

#2.2读取并过滤PlantCARE里面的结果文件
cutin_cir <-fread(file="Plantcare_result.txt",header=T,sep='\t',data.table=F)%>%
  left_join(cis_annotation, by = c('cis_element')) %>%   #与分类文件进行关联
  dplyr::filter(cis_element !=  "Unnamed__1")%>%         #过滤掉一些未知的顺式作用元件
  dplyr::filter(cis_element !=  "Unnamed__2")%>%
  dplyr::filter(cis_element !=  "Unnamed__3")%>%
  dplyr::filter(cis_element !=  "Unnamed__4")%>%
  dplyr::filter(cis_element !=  "Unnamed__5")%>%
  dplyr::filter(cis_element !=  "Unnamed__6")%>%
  dplyr::filter(cis_element !=  "Unnamed__7")%>%
  dplyr::filter(cis_element !=  "Unnamed__8")%>%
  dplyr::filter(cis_element !=  "Unnamed__9")%>%
  dplyr::filter(cis_element !=  "Unnamed__10")%>%
  dplyr::filter(cis_element !=  "Unnamed__11")%>%
  dplyr::filter(cis_element !=  "Unnamed__12") %>%
  dplyr::filter(sequence !=     "motif_sequence")%>%
  dplyr::filter(cis_element !=  "TATA-box")%>%  #filter common
  dplyr::filter(cis_element !=  "CAAT-box")%>%  #filter enhancer
  dplyr::filter(cis_element !=  "GC-motif")%>%  
  dplyr::filter(cis_element !=  "TCA") %>%
  dplyr::filter(!is.na(category)) %>%   # 过滤掉不在分类文件里面的cis
  select(gene_id, cis_element, sequence,  category) #distinct()

3. 自定义函数及富集分组统计

#3.1 定义如果整列都是NA就删除这列的一个函数
removeColsAllNa  <- function(x){x[, apply(x, 2, function(y) any(!is.na(y)))]}  

#3.2 对富集到进行分组统计 
#由于我设置了三个分类,基于分类做了stat1,stat2,stat3
#如果分组较多,可以自行增加分组统计

stat1 <- group_by(cutin_cir, gene_id, category, cis_element) %>%
  summarise(count = n()) %>%
  pivot_wider(names_from = cis_element, values_from = count) %>%  #长数据转化为宽数据
  filter(category %in% "Abiotic and biotic stresses") %>%
  removeColsAllNa() %>%   #去除整列都是NA的这列
  select(-category)


stat2<- group_by(cutin_cir, gene_id, category, cis_element) %>%
   summarise(count = n()) %>%
   pivot_wider(names_from = cis_element, values_from = count) %>%  #长数据转化为宽数据
   filter(category %in% "Plant growth and development") %>%
   removeColsAllNa() %>%   #去除整列都是NA的这列
   select(-category)


stat3<- group_by(cutin_cir, gene_id, category, cis_element) %>%
   summarise(count = n()) %>%
   pivot_wider(names_from = cis_element, values_from = count) %>%  #长数据转化为宽数据
   filter(category %in% "Phytohormone responsive") %>%
   removeColsAllNa() %>%   #去除整列都是NA的这列
   select(-category)

#3.3 数据转换
#将stat数据中NA的值改为0
stat1[is.na(stat1)] <- 0   
stat2[is.na(stat2)] <- 0 
stat3[is.na(stat3)] <- 0 

#将stat数据转化为matrix矩阵
stat1<- stat1[,c(2:29)] %>%
  column_to_rownames(var = 'gene_id')

stat2<- stat2[,c(2:19)] %>%
   column_to_rownames(var = 'gene_id')

stat3<- stat3[,c(2:13)] %>%
   column_to_rownames(var = 'gene_id')

4. 热图的绘制

具体绘图参数的修改可以参考ComplexHeatmap:
https://jokergoo.github.io/ComplexHeatmap-reference/book/

pdf('./Plant growth and development2.pdf', width = 5, height = 7)
p <- pheatmap(stat,
         cluster_cols = F,
         cluster_rows = F,
         cellwidth = 15,
         cellheight = 15,
         display_numbers = TRUE,
         number_format = "%.0f",
         legend = FALSE,
         color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
print(p)
dev.off()
### 示例导出PDF文件 

dev.new()
empty_chars <- character(length = 11)  
#创制指定个数空字符串
#主要用于消除labels_row的数值

#"Abiotic and biotic stresses"
pheatmap(stat1,
         cluster_cols = F,
         cluster_rows = F,
         cellwidth = 15,
         cellheight = 15,
         display_numbers = TRUE,
         number_format = "%.0f",
         legend = FALSE,
         fontfamily_row = "Times New Roman",# 行名字体,此外还有"Arial", "Courieri", "Standarad Symbols L"等字体
         fontface_row = "italic", #“plain”(普通)、“bold”(粗体)、“italic”(斜体)、“underline”(下划线)
         row_names_side="left",#通常默认为右侧,更改行标题至左侧;
         #row_dend_side="right" #更改聚类位置,启动子元件分析中未用到此功能
         color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
Abiotic and biotic stresses 调控元件分析
#"Plant growth and development"
pheatmap(stat2,
         cluster_cols = F,
         cluster_rows = F,
         cellwidth = 15,
         cellheight = 15,
         display_numbers = TRUE,
         number_format = "%.0f",
         legend = FALSE,
         labels_row= empty_chars,#取消行标题
         color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
取消行标题的"Plant growth and development"
#"Phytohormone responsive"
pheatmap(stat3,
         #fontsize_row = 20,
         #fontface="italic",
         #fontfamily="Times New Roman",
         cluster_cols = F,
         cluster_rows = F,
         cellwidth = 15,
         cellheight = 15,
         display_numbers = T,
         number_format = "%.0f",
         legend = F,
         color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100)
         )
无行标题的Phytohormone responsive

5. 堆叠图数据整理及绘制

#得到gene_id的因子,使得柱形图与热图的基因顺序一致
factor_levels <- group_by(cutin_cir, gene_id, category) %>%
  summarise(count = n()) %>%
  filter(category %in% "Abiotic and biotic stresses") %>%
  mutate(gene_id = factor(gene_id, levels = rev(gene_id)))
factor_levels <- levels(factor_levels$gene_id)

#整理绘图时的柱形图数据
stat_bar <- group_by(cutin_cir, gene_id, category) %>%
  summarise(count = n()) %>%
  mutate(cum_count = cumsum(count)) %>%
  mutate(position_x = cum_count-0.5*count) %>%
  mutate(gene_id = factor(gene_id, levels = rev(factor_levels))) %>%
  mutate(category = factor(category, levels = c('Plant growth and development', 
                                                'Phytohormone responsive',
                                                'Abiotic and biotic stresses')))


ggplot(data = stat_bar,
            #width=5, 
            #height=7,
            aes(x = count, y = gene_id)) +
  geom_col(aes(fill = category), position = 'stack', width = 0.7, alpha = 0.6)+
  geom_text(color = 'white', 
            aes(x = position_x, y = gene_id, label = count)) +
  scale_fill_lancet() +
  scale_x_continuous(expand = c(0,0))+  #使柱子从0的位置开始
  theme_half_open()+  #空白背景
  theme(legend.title = element_blank(),    ##########图例名称为空
        legend.text = element_text(size = 12, face = "bold"), ##########图例文字大小
        legend.position = 'left',      ############图例位置
        legend.key.size=unit(0.8,'cm'),   #############图例大小
        plot.margin = margin(t=80, r=80, b=80, l=80, unit = "pt"),
        #axis.text.y = element_blank(),
        axis.text.y=element_text(face="bold.italic"),#修改Y轴坐标字体格式
        axis.ticks=element_blank())+labs(y = NULL)
堆叠图结果展示
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容