对基因启动子元件分析,可以用plantCARE进行分析,结果文件直接发送邮件:
plantCARE链接:
http://bioinformatics.psb.ugent.be/webtools/plantcare/html/
1. R包的安装及工作路径设置
library(devtools)
install_github("jokergoo/ComplexHeatmap") #ComplexHeatmap包的加载直接从github下载
library(tidyverse)
library(cowplot)
library(openxlsx)
library(pheatmap)
library(ggsci)
library(data.table)
library(extrafont)
library(ggplot2)
library(ComplexHeatmap)
library(viridis)
setwd("工作路径") #按照自己工作路径设置
2. 读取读取和整理文件
pheatmap运行主要需要两类文件:
一个为启动子于PlantCARE里的结果文件(需要自行替换行标题——对应至基因)
一个是基于PlantCARE结果对调控元件进行分类(各启动子上存在的调控元件存在差异,需自行汇总)
文件读取如下:
#2.1读取顺式作用元件的分类文件
#顺式作用元件分类文件是基于启动子序列在plantcare上分析得到结果进行分类
#主要分为三大类:Abiotic and biotic stresses,Plant growth and development,Phytohormone responsive
cis_annotation <- read.xlsx('./cis_promoter_category.xlsx') %>%
select(cis_element, category)
#2.2读取并过滤PlantCARE里面的结果文件
cutin_cir <-fread(file="Plantcare_result.txt",header=T,sep='\t',data.table=F)%>%
left_join(cis_annotation, by = c('cis_element')) %>% #与分类文件进行关联
dplyr::filter(cis_element != "Unnamed__1")%>% #过滤掉一些未知的顺式作用元件
dplyr::filter(cis_element != "Unnamed__2")%>%
dplyr::filter(cis_element != "Unnamed__3")%>%
dplyr::filter(cis_element != "Unnamed__4")%>%
dplyr::filter(cis_element != "Unnamed__5")%>%
dplyr::filter(cis_element != "Unnamed__6")%>%
dplyr::filter(cis_element != "Unnamed__7")%>%
dplyr::filter(cis_element != "Unnamed__8")%>%
dplyr::filter(cis_element != "Unnamed__9")%>%
dplyr::filter(cis_element != "Unnamed__10")%>%
dplyr::filter(cis_element != "Unnamed__11")%>%
dplyr::filter(cis_element != "Unnamed__12") %>%
dplyr::filter(sequence != "motif_sequence")%>%
dplyr::filter(cis_element != "TATA-box")%>% #filter common
dplyr::filter(cis_element != "CAAT-box")%>% #filter enhancer
dplyr::filter(cis_element != "GC-motif")%>%
dplyr::filter(cis_element != "TCA") %>%
dplyr::filter(!is.na(category)) %>% # 过滤掉不在分类文件里面的cis
select(gene_id, cis_element, sequence, category) #distinct()
3. 自定义函数及富集分组统计
#3.1 定义如果整列都是NA就删除这列的一个函数
removeColsAllNa <- function(x){x[, apply(x, 2, function(y) any(!is.na(y)))]}
#3.2 对富集到进行分组统计
#由于我设置了三个分类,基于分类做了stat1,stat2,stat3
#如果分组较多,可以自行增加分组统计
stat1 <- group_by(cutin_cir, gene_id, category, cis_element) %>%
summarise(count = n()) %>%
pivot_wider(names_from = cis_element, values_from = count) %>% #长数据转化为宽数据
filter(category %in% "Abiotic and biotic stresses") %>%
removeColsAllNa() %>% #去除整列都是NA的这列
select(-category)
stat2<- group_by(cutin_cir, gene_id, category, cis_element) %>%
summarise(count = n()) %>%
pivot_wider(names_from = cis_element, values_from = count) %>% #长数据转化为宽数据
filter(category %in% "Plant growth and development") %>%
removeColsAllNa() %>% #去除整列都是NA的这列
select(-category)
stat3<- group_by(cutin_cir, gene_id, category, cis_element) %>%
summarise(count = n()) %>%
pivot_wider(names_from = cis_element, values_from = count) %>% #长数据转化为宽数据
filter(category %in% "Phytohormone responsive") %>%
removeColsAllNa() %>% #去除整列都是NA的这列
select(-category)
#3.3 数据转换
#将stat数据中NA的值改为0
stat1[is.na(stat1)] <- 0
stat2[is.na(stat2)] <- 0
stat3[is.na(stat3)] <- 0
#将stat数据转化为matrix矩阵
stat1<- stat1[,c(2:29)] %>%
column_to_rownames(var = 'gene_id')
stat2<- stat2[,c(2:19)] %>%
column_to_rownames(var = 'gene_id')
stat3<- stat3[,c(2:13)] %>%
column_to_rownames(var = 'gene_id')
4. 热图的绘制
具体绘图参数的修改可以参考ComplexHeatmap:
https://jokergoo.github.io/ComplexHeatmap-reference/book/
pdf('./Plant growth and development2.pdf', width = 5, height = 7)
p <- pheatmap(stat,
cluster_cols = F,
cluster_rows = F,
cellwidth = 15,
cellheight = 15,
display_numbers = TRUE,
number_format = "%.0f",
legend = FALSE,
color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
print(p)
dev.off()
### 示例导出PDF文件
dev.new()
empty_chars <- character(length = 11)
#创制指定个数空字符串
#主要用于消除labels_row的数值
#"Abiotic and biotic stresses"
pheatmap(stat1,
cluster_cols = F,
cluster_rows = F,
cellwidth = 15,
cellheight = 15,
display_numbers = TRUE,
number_format = "%.0f",
legend = FALSE,
fontfamily_row = "Times New Roman",# 行名字体,此外还有"Arial", "Courieri", "Standarad Symbols L"等字体
fontface_row = "italic", #“plain”(普通)、“bold”(粗体)、“italic”(斜体)、“underline”(下划线)
row_names_side="left",#通常默认为右侧,更改行标题至左侧;
#row_dend_side="right" #更改聚类位置,启动子元件分析中未用到此功能
color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
#"Plant growth and development"
pheatmap(stat2,
cluster_cols = F,
cluster_rows = F,
cellwidth = 15,
cellheight = 15,
display_numbers = TRUE,
number_format = "%.0f",
legend = FALSE,
labels_row= empty_chars,#取消行标题
color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100))
#"Phytohormone responsive"
pheatmap(stat3,
#fontsize_row = 20,
#fontface="italic",
#fontfamily="Times New Roman",
cluster_cols = F,
cluster_rows = F,
cellwidth = 15,
cellheight = 15,
display_numbers = T,
number_format = "%.0f",
legend = F,
color = colorRampPalette(colors = c("white","#7baf57","#448e12"))(100)
)
5. 堆叠图数据整理及绘制
#得到gene_id的因子,使得柱形图与热图的基因顺序一致
factor_levels <- group_by(cutin_cir, gene_id, category) %>%
summarise(count = n()) %>%
filter(category %in% "Abiotic and biotic stresses") %>%
mutate(gene_id = factor(gene_id, levels = rev(gene_id)))
factor_levels <- levels(factor_levels$gene_id)
#整理绘图时的柱形图数据
stat_bar <- group_by(cutin_cir, gene_id, category) %>%
summarise(count = n()) %>%
mutate(cum_count = cumsum(count)) %>%
mutate(position_x = cum_count-0.5*count) %>%
mutate(gene_id = factor(gene_id, levels = rev(factor_levels))) %>%
mutate(category = factor(category, levels = c('Plant growth and development',
'Phytohormone responsive',
'Abiotic and biotic stresses')))
ggplot(data = stat_bar,
#width=5,
#height=7,
aes(x = count, y = gene_id)) +
geom_col(aes(fill = category), position = 'stack', width = 0.7, alpha = 0.6)+
geom_text(color = 'white',
aes(x = position_x, y = gene_id, label = count)) +
scale_fill_lancet() +
scale_x_continuous(expand = c(0,0))+ #使柱子从0的位置开始
theme_half_open()+ #空白背景
theme(legend.title = element_blank(), ##########图例名称为空
legend.text = element_text(size = 12, face = "bold"), ##########图例文字大小
legend.position = 'left', ############图例位置
legend.key.size=unit(0.8,'cm'), #############图例大小
plot.margin = margin(t=80, r=80, b=80, l=80, unit = "pt"),
#axis.text.y = element_blank(),
axis.text.y=element_text(face="bold.italic"),#修改Y轴坐标字体格式
axis.ticks=element_blank())+labs(y = NULL)