在开始笔记之前我只想吐槽一下,官网里这一部分的代码错的离谱!!!各种报错报的我怀疑人生。。。
第四节 比对结果过滤和文件格式转换
(一)利用比对质量过滤比对上的【可选步骤】
有些项目可能需要对比对质量分数进行更严格的筛选。这里通过实例详细讨论了bowtie是如何分配质量分数的。
MAPQ(x) = -10 * log10(P(x is mapped wrongly)) = -10 * log10(p)
which ranges from 0 to 37, 40 or 42.
samtools view -q minQualityScore
将消除所有低于用户定义的minQualityScore的比对结果。
#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag/alignment/"
minQualityScore=2
cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
samtools view -q $minQualityScore ${projPath}/${i}_bowtie2.sam >${projPath}/filtered_sam/${i}_bowtie2.qualityScore$minQualityScore.sam
done
(二)文件格式转换【必需步骤】
这一部分是为peak calling和可视化做准备所必需的,其中需要完成一些过滤和文件格式转换。
#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag/"
cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
## Filter and keep the mapped read pairs
samtools view -bS -F 0x04 $projPath/alignment/${name}_bowtie2.sam > $projPath/bam_files/${name}_bowtie2.mapped.bam
## Convert into bed file format
bedtools bamtobed -bedpe -i $projPath/bam_files/${name}_bowtie2.mapped.bam > $projPath/bed_files/${name}_bowtie2.bed
## Keep the read pairs that are on the same chromosome and fragment length less than 1000bp.
awk '$1==$4 && $6-$2 < 1000 {print $0}' $projPath/bed_files/${name}_bowtie2.bed > $projPath/bed_files/${name}_bowtie2.clean.bed
## Only extract the fragment related columns
cut -f 1,2,6 $projPath/bed_files/${name}_bowtie2.clean.bed | sort -k1,1 -k2,2n -k3,3n > $projPath/bed_files/${name}_bowtie2.fragments.bed
done
(三)评估重复样品的再现性(接上一篇笔记)
为了研究重复样品之间和跨条件下的重现性,基因组被分割成500 bp的bin,并在重复数据集之间计算每个bin中log2转换值的Pearson相关性。多个重复和IgG对照数据集显示在一个层次聚类相关矩阵中。
#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag"
binLen=500
cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' $projPath/bed_files/${i}_bowtie2.fragments.bed | sort -k1,1V -k2,2n | uniq -c | awk -v OFS="\t" '{print $2, $3, $1}'| sort -k1,1V -k2,2n > $projPath/bed_files/${i}_bowtie2.fragmentsCount.bin$binLen.bed
done
在R中可视化:
> library(magrittr)
> library(dplyr)
> library(tidyverse)
> library(corrplot)
> sampleList = c("SH_Hs_K27m3_rep1", "SH_Hs_K27m3_rep2", "SH_Hs_K4m3_rep1", "SH_Hs_K4m3_rep2", "SH_Hs_IgG_rep1","SH_Hs_IgG_rep2_R1","SH_Hs_IgG_rep2_R2")
histList = c("K27m3", "K4m3", "IgG")
> reprod = c()
> fragCount = NULL
> for(hist in sampleList){
if(is.null(fragCount)){
fragCount = read.table(paste0(hist, "_bowtie2.fragmentsCount.bin500.bed"), header = FALSE)
colnames(fragCount) = c("chrom", "bin", hist)
}else{
fragCountTmp = read.table(paste0(hist, "_bowtie2.fragmentsCount.bin500.bed"), header = FALSE)
colnames(fragCountTmp) = c("chrom", "bin", hist)
fragCount = full_join(fragCount, fragCountTmp, by = c("chrom", "bin"))
}
}
> M = cor(fragCount %>% select(-c("chrom", "bin")) %>% log2(), use = "complete.obs")
> corrplot(M, method = "color", outline = T, addgrid.col = "darkgray", order="hclust", addrect = 3, rect.col = "black", rect.lwd = 3,cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1, addCoef.col = "black", number.digits = 2, number.cex = 1, col = colorRampPalette(c("midnightblue","white","darkred"))(100))
第五节 Spike-in校准
这部分是可选的,但根据你的实验方法来决定是否进行这一步。我们已经在第3.1.2节展示了spike-in基因组的比对方式,并在第3.2.2节展示了spike-in比对总结。
潜在的假设是,在一系列样本中,原基因组比对上的片段与大肠杆菌基因组比对上的片段的比例是相同的,每个样本使用相同数量的细胞。由于这个假设,我们没有在实验之间或纯化的pA-Tn5批次之间进行标准化处理,因为这些批次的大肠杆菌DNA携带量可能非常不同。使用常数C来避免归一化数据中的小分数,我们定义一个scaling factor S为:
S = C / (fragments mapped to E. coli genome)
然后计算Normalized coverage:
Normalized coverage = (primary_genome_coverage) * S
这个常数是一个任意的乘数,通常是10000。产生的文件相对较小,如基因组coverage bedgraph文件。
#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag"
chromSize="/gpfs/home/fangy04/cut_tag/hg38.chrom.size"
cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
seqDepthDouble=`samtools view -F 0x04 $projPath/alignment/${i}_spikein.sam | wc -l`
seqDepth=$((seqDepthDouble/2))
if [[ "$seqDepth" -gt "1" ]]; then
scale_factor=`echo "10000 / $seqDepth" | bc -l`
echo "Scaling factor for $histName is: $scale_factor!"
bedtools genomecov -bg -scale $scale_factor -i $projPath/bed_files/${i}_bowtie2.fragments.bed -g $chromSize > $projPath/bedgraph/${i}_bowtie2.fragments.normalized.bedgraph
fi
done
(一)Scaling factor
R:
> library(magrittr)
> library(dplyr)
> library(tidyverse)
> library(corrplot)
> sampleList = c("SH_Hs_K27m3_rep1", "SH_Hs_K27m3_rep2", "SH_Hs_K4m3_rep1", "SH_Hs_K4m3_rep2", "SH_Hs_IgG_rep1","SH_Hs_IgG_rep2_R1","SH_Hs_IgG_rep2_R2")
> histList = c("K27m3", "K4m3", "IgG")
> scaleFactor = c()
> multiplier = 10000
> for(hist in sampleList){
spikeDepth = read.table(paste0(hist, "_bowtie2_spikeIn.seqDepth"), header = FALSE, fill = TRUE)$V1[1]
histInfo = strsplit(hist, "_")[[1]]
scaleFactor = data.frame(scaleFactor = multiplier/spikeDepth, Histone = histInfo[3], Replicate = histInfo[4]) %>% rbind(scaleFactor, .)
}
> scaleFactor$Histone = factor(scaleFactor$Histone, levels = histList)
> combine_scaleFactor <- left_join(alignDupSummary, scaleFactor, by = c("Histone", "Replicate"))
> combine_scaleFactor
Histone Replicate SequencingDepth MappedFragNum_hg38 AlignmentRate_hg38 MappedFragNum_spikeIn AlignmentRate_spikeIn DuplicationRate
1 K27m3 rep1 2984630 2860326 95.96% 235 0.01% 4.8544%
2 K27m3 rep2 2702260 2607267 96.57% 487 0.02% 1.0424%
3 K4m3 rep1 1581710 1494319 94.86% 375 0.02% 6.5808%
4 K4m3 rep2 1885056 1742643 92.85% 4441 0.24% 2.6899%
5 IgG rep1 2127635 1749023 82.73% 75767 3.56% 81.4229%
6 IgG rep2 2192908 1994753 91.17% 79137 3.61% 34.2772%
7 IgG rep2 2192908 1994753 91.17% 79137 3.61% 34.2772%
8 IgG rep2 1168663 1076138 92.16% 42122 3.6% 39.6811%
9 IgG rep2 1168663 1076138 92.16% 42122 3.6% 39.6811%
EstimatedLibrarySize UniqueFragNum scaleFactor
1 28538043 2725126.0 42.5531915
2 124296273 2582370.8 20.5338809
3 10894128 1401681.3 26.6666667
4 31948293 1703223.5 2.2517451
5 328542 326994.5 0.1319836
6 2202634 1313921.7 0.1263631
7 2202634 1313921.7 0.2374056
8 967425 649647.8 0.1263631
9 967425 649647.8 0.2374056
可视化:
## Generate sequencing depth boxplot
> fig6A = scaleFactor %>% ggplot(aes(x = Histone, y = scaleFactor, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 20) +
ylab("Spike-in Scalling Factor") +
xlab("")
> normDepth = inner_join(scaleFactor, alignResult, by = c("Histone", "Replicate")) %>% mutate(normDepth = MappedFragNum_hg38 * scaleFactor)
> fig6B = normDepth %>% ggplot(aes(x = Histone, y = normDepth, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 20) +
ylab("Normalization Fragment Count") +
xlab("") +
coord_cartesian(ylim = c(1000000, 130000000))
> ggarrange(fig6A, fig6B, ncol = 2, common.legend = TRUE, legend="bottom")
第六节 Peak Calling
(一)SEACR
对CUT&RUN的稀疏富集分析SEACR包的设计用于从非常低的背景(即没有read覆盖的区域)的染色质中富集区域并call peaks,这是典型的CUT&Tag染色质分析实验。SEACR需要来自paired-end测序的bedGraph文件作为输入,并将峰定义为不与IgG对照数据集中描绘的背景信号块重叠的连续碱基对覆盖块。SEACR既能有效地从因子结合位点calling窄峰,也能calling出某些组蛋白修饰特征的宽域。该方法的描述发表在2019年的Meers et al. 2019上,用户手册可在github上查阅。由于我们已经使用大肠杆菌read count对片段计数进行了规范化,所以我们将SEACR的规范化选项设置为“non”。否则,建议使用“norm”。
首先要先下载seacr.sh和Rscript,下载地址:https://github.com/FredHutch/SEACR/
另外,你需要把seacr.sh里的一部分修改掉,换成你的文件所在目录:
第三,你还要保证你的电脑里安装了R和bedtools,因为seacr.sh脚本要调用它们。
然后写脚本(这里脚本我做了改动,因为官网的代码实在是。。。无力吐槽了):
#!/bin/bash
projPath="/gpfs/home/fangy04/cut_tag"
seacr="/gpfs/home/fangy04/cut_tag/seacr.sh"
cat /gpfs/home/fangy04/cut_tag/filenames | while read i
do
bash $seacr $projPath/bedgraph/${i}_bowtie2.fragments.normalized.bedgraph \
$projPath/bedgraph/SH_Hs_IgG_rep1_bowtie2.fragments.normalized.bedgraph \ #这里和官网不一样,因为如果按照官网的代码是报错的,我查了seacr的官网,这里应该是IgG的bedgraph文件,所以我直接用的IgG_rep1的bedgraph文件
non stringent $projPath/peak_calling/${i}_seacr_control.peaks
bash $seacr $projPath/bedgraph/${i}_bowtie2.fragments.normalized.bedgraph 0.01 non stringent $projPath/peak_calling/${i}_seacr_top0.01.peaks
done
然后生成这些文件:
$ ll
total 33024
-rw------- 1 fangy04 fangy04 163240 Dec 24 10:15 SH_Hs_IgG_rep1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 8463714 Dec 24 10:16 SH_Hs_IgG_rep2_R1_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 620497 Dec 24 10:16 SH_Hs_IgG_rep2_R1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 2543921 Dec 24 10:17 SH_Hs_IgG_rep2_R2_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 315210 Dec 24 10:17 SH_Hs_IgG_rep2_R2_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 11751678 Dec 24 10:17 SH_Hs_K27m3_rep1_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 381246 Dec 24 10:18 SH_Hs_K27m3_rep1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 7449627 Dec 24 10:19 SH_Hs_K27m3_rep2_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 604217 Dec 24 10:19 SH_Hs_K27m3_rep2_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 329727 Dec 24 10:19 SH_Hs_K4m3_rep1_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 58395 Dec 24 10:19 SH_Hs_K4m3_rep1_seacr_top0.01.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 516929 Dec 24 10:20 SH_Hs_K4m3_rep2_seacr_control.peaks.stringent.bed
-rw------- 1 fangy04 fangy04 234455 Dec 24 10:20 SH_Hs_K4m3_rep2_seacr_top0.01.peaks.stringent.bed
(1)Number of peaks called
然后在R里统计:
##=== R command ===##
> sampleList = c("SH_Hs_K27m3_rep1", "SH_Hs_K27m3_rep2", "SH_Hs_K4m3_rep1", "SH_Hs_K4m3_rep2", "SH_Hs_IgG_rep1","SH_Hs_IgG_rep2_R1","SH_Hs_IgG_rep2_R2")
> peakN = c()
> peakWidth = c()
> peakType = c("control", "top0.01")
> for(hist in sampleList){
histInfo = strsplit(hist, "_")[[1]]
if(histInfo[3] != "IgG"){
for(type in peakType){
peakInfo = read.table(paste0(hist, "_seacr_", type, ".peaks.stringent.bed"), header = FALSE, fill = TRUE) %>% mutate(width = abs(V3-V2))
peakN = data.frame(peakN = nrow(peakInfo), peakType = type, Histone = histInfo[3], Replicate = histInfo[4]) %>% rbind(peakN, .)
peakWidth = data.frame(width = peakInfo$width, peakType = type, Histone = histInfo[4], Replicate = histInfo[4]) %>% rbind(peakWidth, .)
}
}
}
> peakN %>% select(Histone, Replicate, peakType, peakN)
Histone Replicate peakType peakN
1 K27m3 rep1 control 185890
2 K27m3 rep1 top0.01 5849
3 K27m3 rep2 control 117190
4 K27m3 rep2 top0.01 9619
5 K4m3 rep1 control 5048
6 K4m3 rep1 top0.01 880
7 K4m3 rep2 control 8187
8 K4m3 rep2 top0.01 3736
(2)在生物学重复里peaks的再现性
对重复数据集的peak calling与定义可重复峰进行比较。选取峰的前1%(按每个区块的总信号进行排序)作为高置信度点。
这一块实际上我是比较困惑的。首先就是官网的代码,如果你完全按照代码来,最后算出来的peakReprodRate(可重现峰比例)和peakreprod(可重现峰数量)都是NA,所以下面一块的代码我改动较多,同学可以去官网运行他的代码体验一下。第二就是如果想看不同生物学重复之间的再现性,是不是只需要看rep2对于rep1的再现性呢?因为我改动代码后,所有rep1的peakReprodRate都是100%,而所有rep2都是不到100%的一个数值。如果有同学也练习过这个流程欢迎讨论~
> library(GenomicRanges)
> library(magrittr)
> library(dplyr)
> histL = c("SH_Hs_K27m3", "SH_Hs_K4m3")
> repL = c("rep1","rep2")
> peakType = c("control", "top0.01")
> peakOverlap = c()
> for(type in peakType){
for(hist in histL){
overlap.gr = GRanges()
for(rep in repL){
peakInfo = read.table(paste0(hist, "_", rep, "_seacr_", type, ".peaks.stringent.bed"), header = FALSE, fill = TRUE)
peakInfo.gr = GRanges(peakInfo$V1, IRanges(start = peakInfo$V2, end = peakInfo$V3), strand = "*")
if(length(overlap.gr) >0){
overlap.gr = overlap.gr[findOverlaps(overlap.gr, peakInfo.gr)@from]
}else{
overlap.gr = peakInfo.gr
}
peakOverlap = data.frame(peakReprod = length(overlap.gr), Histone = hist, Replicate = rep, peakType = type) %>% rbind(peakOverlap, .)
}
}
}
> peakOverlap = peakOverlap[c(1,5,2,6,3,7,4,8),]#必须
> peakReprod = data.frame(peakOverlap[,c(2,3,4,1)], peakN = peakN$peakN) %>% mutate(peakReprodRate = peakReprod/peakN * 100)
> peakReprod
Histone Replicate peakType peakReprod peakN peakReprodRate
1 SH_Hs_K27m3 rep1 control 185890 185890 100.00000
2 SH_Hs_K27m3 rep1 top0.01 5849 5849 100.00000
3 SH_Hs_K27m3 rep2 control 106767 117190 91.10590
4 SH_Hs_K27m3 rep2 top0.01 5271 9619 54.79780
5 SH_Hs_K4m3 rep1 control 5048 5048 100.00000
6 SH_Hs_K4m3 rep1 top0.01 880 880 100.00000
7 SH_Hs_K4m3 rep2 control 5046 8187 61.63430
8 SH_Hs_K4m3 rep2 top0.01 875 3736 23.42077
(3)FRagment proportion in Peaks regions (FRiPs)
我们计算峰中的reads (FRiPs)作为信噪比的度量,并将其与IgG数据集中的FRiPs进行对比以作说明。虽然CUT&Tag的测序深度通常只有1- 500万reads,但该方法的低背景导致高FRiP分数。
> library(chromVAR)
> library(GenomicRanges)
> library(magrittr)
> library(dplyr)
> bamDir = paste0(projPath, "F:/cut_tag/bam_files")#bam文件的目录
> inPeakData = c()
> histL = c("SH_Hs_K27m3", "SH_Hs_K4m3")
> repL = c("rep1","rep2")
## overlap with bam file to get count
> for(hist in histL){
for(rep in repL){
peakRes = read.table(paste0(hist, "_", rep, "_seacr_control.peaks.stringent.bed"), header = FALSE, fill = TRUE)
peak.gr = GRanges(seqnames = peakRes$V1, IRanges(start = peakRes$V2, end = peakRes$V3), strand = "*")
bamFile = paste0(bamDir, "/", hist, "_", rep, "_bowtie2.mapped.bam")
fragment_counts <- getCounts(bamFile, peak.gr, paired = TRUE, by_rg = FALSE, format = "bam")
inPeakN = counts(fragment_counts)[,1] %>% sum
inPeakData = rbind(inPeakData, data.frame(inPeakN = inPeakN, Histone = hist, Replicate = rep))
}
}
> frip = data.frame(alignResult[c(1:4),]) %>% mutate(FragInPeakNum = inPeakData$inPeakN) %>% mutate(FRiPs = inPeakN/MappedFragNum_hg38 * 100)
(4)可视化peak number, peak width, peak reproducibility and FRiPs.
> library(ggplot2)
> library(ggpubr)
> library(ggridges)
> library(viridis)
> fig7A = peakN %>% ggplot(aes(x = Histone, y = peakN, fill = Histone)) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
facet_grid(~peakType) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ggpubr::rotate_x_text(angle = 30) +
ylab("Number of Peaks") +
xlab("")
> fig7B = peakWidth %>% ggplot(aes(x = Histone, y = width, fill = Histone)) +
geom_violin() +
facet_grid(Replicate~peakType) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
scale_y_continuous(trans = "log", breaks = c(400, 3000, 22000)) +
theme_bw(base_size = 18) +
ggpubr::rotate_x_text(angle = 30) +
ylab("Width of Peaks") +
xlab("")
> fig7C = peakReprod %>% ggplot(aes(x = Histone, y = peakReprodRate, fill = Histone, label = round(peakReprodRate, 2))) +
geom_bar(stat = "identity") +
geom_text(vjust = 0.1) +
facet_grid(Replicate~peakType) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ggpubr::rotate_x_text(angle = 30) +
ylab("% of Peaks Reproduced") +
xlab("")
> fig7D = frip %>% ggplot(aes(x = Histone, y = FRiPs, fill = Histone, label = round(FRiPs, 2))) +
geom_boxplot() +
geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.55, option = "magma", alpha = 0.8) +
scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
theme_bw(base_size = 18) +
ylab("% of Fragments in Peaks") +
xlab("")
> ggarrange(fig7A, fig7B, fig7C, fig7D, ncol = 2, nrow=2, common.legend = TRUE, legend="bottom")