计算的思维导图
# Summarizing interval coverage across genomic features
bedfile <- valr_example("genes.hg19.chr22.bed.gz")
genomefile <- valr_example("hg19.chrom.sizes.gz")
bgfile <- valr_example("hela.h3k4.chip.bg.gz")
genes <- read_bed(bedfile, n_fields = 6)
genome <- read_genome(genomefile)
y <- read_bedgraph(bgfile)
# generate 1 bp TSS intervals, "+" strand only
tss <- genes %>%
filter(strand == "+") %>%
mutate(end = start + 1)
# 1000 bp up and downstream
region_size <- 1000
# 50 bp windows
win_size <- 50
# add slop to the TSS, break into windows and add a group
# bed_slop函数扩展区域空间,延长区间上下游,这里延长上下游1KB
# bed_makewindows 扩展后的bed区间以50bp为一个bin进行划分
x <- tss %>%
bed_slop(genome, both = region_size) %>%
bed_makewindows(genome, win_size)
# map signals to TSS regions and calculate summary statistics.
# 使用bed_map函数统计每一个bin中的总reads数目
res <- bed_map(x, y, win_sum = sum(value, na.rm = TRUE)) %>%
group_by(.win_id) %>%
summarize(win_mean = mean(win_sum, na.rm = TRUE),
win_sd = sd(win_sum, na.rm = TRUE))
## ggplot2进行可视化
library(ggplot2)
x_labels <- seq(-region_size, region_size, by = win_size * 5)
x_breaks <- seq(1, 41, by = 5)
sd_limits <- aes(ymax = win_mean + win_sd, ymin = win_mean - win_sd)
p <- ggplot(res, aes(x = .win_id, y = win_mean)) +
geom_point(size = 0.25) + geom_pointrange(sd_limits, size = 0.1) +
scale_x_continuous(labels = x_labels, breaks = x_breaks) +
xlab("Position (bp from TSS)") + ylab("Signal") +
theme_classic()
ggplot(res, aes(x = .win_id, y = win_mean)) +
geom_point(size = 0.25) + geom_line(size = 1) +
scale_x_continuous(labels = x_labels, breaks = x_breaks) +
xlab("Position (bp from TSS)") + ylab("Signal") +
theme_bw()