首先附上软件链接:https://www.bioconductor.org/packages/release/bioc/html/systemPipeR.html
http://www.bioconductor.org/help/course-materials/2015/BioC2015/systemPipeR_Presentation.html
systemPipeR可以系统的,重头到尾的分析NGS数据
目前的Workflow templates有:
1.RNA-Seq Workflow
2.ChIP-Seq Workflow
3.VAR-Seq Workflow
4.Ribo-Seq Workflow
由于交作业是采用Chip-seq,所以这里分享我的收获心得,并且大部分代码都可以在这个包的readme上查到,那么我会对有些代码进行说明介绍和结果的展示
加载包
首先先安装包
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
BiocManager::install("systemPipeRdata")
BiocManager::install("systemPipeR")
BiocManager::install("ChIPseeker")
BiocManager::install("ChIPpeakAnno")
BiocManager::install("seqLogo")
BiocManager::install("BCRANK")
install.packages("tidyr")
install.packages("graphlayouts")
install.packages("ggridges")
library(systemPipeR)
library(systemPipeRdata)
加载数据
然后我是利用包内的示例数据
setwd(choose.dir())
genWorkenvir(workflow = "chipseq")#在工作目录中自动创建了chipseq目录及子目录,并生成数据
setwd("chipseq")
这一步是生成示例数据
数据质检
那么第一步,我们读取测序的fastq文件
targetspath <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
targets <- read.delim(targetspath, comment.char = "#")
targets[1:4, -c(5, 6)]
其中targets_chip.txt是数据信息
下一步就是对测序的reads质量进行修剪
args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
filterFct <- function(fq, cutoff = 20, Nexceptions = 0) {
qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE)
fq[qcount <= Nexceptions]
# 保留读取 Phred 分数 > = cutoff 为20的 reads,cutoff可以自定义
}
preprocessReads(args = args, Fct = "filterFct(fq, cutoff=20, Nexceptions=0)",
batchsize = 1e+05)
writeTargetsout(x = args, file = "targets_chip_trim.txt", overwrite = TRUE)
这一步相当于做fastqc和过滤
再来就是生成report
args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
library(BiocParallel)
library(batchtools)
f <- function(x) {
library(systemPipeR)
args <- systemArgs(sysma = "param/trim.param", mytargets = "targets_chip.txt")
seeFastq(fastq = infile1(args)[x], batchsize = 1e+05, klength = 8)
}
resources <- list(walltime = 120, ntasks = 1, ncpus = 4, memory = 1024)
fqlist <- lapply(seq(along = trim), f)
pdf("./results/fastqReport.pdf", height = 18, width = 4 * length(fqlist))
seeFastqPlot(unlist(fqlist, recursive = FALSE))
dev.off()
上图所示结果即为每个fastq文件的质检结果,与fastqc类似
而param/trim.param是一些参数
比对
这一步建议在工作站上使用,过程较为长
rgs <- systemArgs(sysma = "param/bowtieSE.param", mytargets = "targets_chip_trim.txt")
sysargs(args)[1] # 第一个 FASTQ 文件的命令行参数
system("bowtie2-build ./data/tair10.fasta ./data/tair10.fasta")
# 对基因组文件建立索引
resources <- list(walltime = 120, ntasks = 1, ncpus = cores(args),
memory = 1024)
reg <- clusterRun(args, conffile = ".batchtools.conf.R", Njobs = 18,
template = "batchtools.slurm.tmpl", runid = "01", resourceList = resources)
getStatus(reg = reg)
waitForJobs(reg = reg)
writeTargetsout(x = args, file = "targets_bam.txt", overwrite = TRUE)
#检测bam文件是否生成完
file.exists(outpaths(args))
其中param/bowtieSE.param,targets_chip_trim.txt和param/trim.param一样是一些参数,targets_chip_trim.txt与targets_chip.txt一样,只不过是修剪过的
而targets_bam.txt是输出的bam文件信息
接着我们可以统计下mapping的信息
read_statsDF <- alignStats(args = args)
write.table(read_statsDF, "results/alignStats.xls", row.names = FALSE,
quote = FALSE, sep = "\t")
read.delim("results/alignStats.xls")
peak calling
我们用MACS2来call peak
首先是要合并我们比对的bam文件
args <- systemArgs(sysma = NULL, mytargets = "targets_bam.txt")
args_merge <- mergeBamByFactor(args, overwrite = TRUE)
writeTargetsout(x = args_merge, file = "targets_mergeBamByFactor.txt",
overwrite = TRUE)
对于没有参考的call peaks
args <- systemArgs(sysma = "param/macs2_noinput.param", mytargets = "targets_mergeBamByFactor.txt")
sysargs(args)[1]
runCommandline(args)
file.exists(outpaths(args))
writeTargetsout(x = args, file = "targets_macs.txt", overwrite = TRUE)
这是param/macs2_noinput.param的内容
那么有参的情况
writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt",
silent = FALSE, overwrite = TRUE)
args_input <- systemArgs(sysma = "param/macs2.param", mytargets = "targets_bam_ref.txt")
sysargs(args_input)[1]
runCommandline(args_input)
file.exists(outpaths(args_input))
writeTargetsout(x = args_input, file = "targets_macs_input.txt",
overwrite = TRUE)
我们注意观察到,无参和有参的区别是有没有这一句
writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt",
silent = FALSE, overwrite = TRUE)
有参的利用比对好的bam文件做成有参的bam_ref文件
第二个区别就是参数区别,无参采用param/macs2_noinput.param,有参采用param/macs2.param
利用ChIPpeakAnno注释
library(ChIPpeakAnno)
library(GenomicFeatures)
args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")
# txdb <- loadDb('./data/tair10.sqlite')
txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff",
dataSource = "TAIR", organism = "Arabidopsis thaliana")
ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))
for (i in seq(along = args)) {
peaksGR <- as(read.delim(infile1(args)[i], comment = "#"),
"GRanges")
annotatedPeak <- annotatePeakInBatch(peaksGR, AnnotationData = genes(txdb))
df <- data.frame(as.data.frame(annotatedPeak), as.data.frame(values(ge[values(annotatedPeak)$feature,
])))
write.table(df, outpaths(args[i]), quote = FALSE, row.names = FALSE,
sep = "\t")
}
writeTargetsout(x = args, file = "targets_peakanno.txt", overwrite = TRUE)
这里的示例数据的注释是 Arabidopsis thaliana GFF文件
利用CHIPseeker注释
library(ChIPseeker)
args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")
# txdb <- loadDb('./data/tair10.sqlite')
txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff",
dataSource = "TAIR", organism = "Arabidopsis thaliana")
ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))
for (i in seq(along = args)) {
peakAnno <- annotatePeak(infile1(args)[i], TxDb = txdb, verbose = FALSE)
df <- as.data.frame(peakAnno)
write.table(df, outpaths(args[i]), quote = FALSE, row.names = FALSE,
sep = "\t")
}
writeTargetsout(x = args, file = "targets_peakanno.txt", overwrite = TRUE)
#看一下注释情况,此处举个例子
peak <- readPeakFile(infile1(args)[1])
covplot(peak, weightCol = "X.log10.pvalue.")
peakHeatmap(outpaths(args)[1], TxDb = txdb, upstream = 1000,
downstream = 1000, color = "red")
plotAvgProf2(outpaths(args)[1], TxDb = txdb, upstream = 1000,
downstream = 1000, xlab = "Genomic Region (5'->3')", ylab = "Read Count Frequency")
对overlap区段的peak进行计数
library(GenomicRanges)
args <- systemArgs(sysma = "param/count_rangesets.param", mytargets = "targets_macs.txt")
args_bam <- systemArgs(sysma = NULL, mytargets = "targets_bam.txt")
bfl <- BamFileList(outpaths(args_bam), yieldSize = 50000, index = character())
countDFnames <- countRangeset(bfl, args, mode = "Union", ignore.strand = TRUE)
writeTargetsout(x = args, file = "targets_countDF.txt", overwrite = TRUE)
差异peaks分析
这里利用的是rundiff()这个函数,来比较实验组与对照组的差异peaks
args_diff <- systemArgs(sysma = "param/rundiff.param", mytargets = "targets_countDF.txt")
cmp <- readComp(file = args_bam, format = "matrix")
dbrlist <- runDiff(args = args_diff, diffFct = run_edgeR, targets = targetsin(args_bam),
cmp = cmp[[1]], independent = TRUE, dbrfilter = c(Fold = 2,
FDR = 1))
writeTargetsout(x = args_diff, file = "targets_rundiff.txt",
overwrite = TRUE)
注意可在dbrfilter = c(Fold = 2,
FDR = 1)这里设置fold change和FDR
GO富集
对peaks富集的基因做GO功能富集
args <- systemArgs(sysma = "param/macs2.param", mytargets = "targets_bam_ref.txt")
args_anno <- systemArgs(sysma = "param/annotate_peaks.param",
mytargets = "targets_macs.txt")
annofiles <- outpaths(args_anno)
gene_ids <- sapply(names(annofiles), function(x) unique(as.character(read.delim(annofiles[x])[,
"gene_id"])), simplify = FALSE)
load("data/GO/catdb.RData")
BatchResult <- GOCluster_Report(catdb = catdb, setlist = gene_ids,
method = "all", id_type = "gene", CLSZ = 2, cutoff = 0.9,
gocats = c("MF", "BP", "CC"), recordSpecGO = NULL)
Motif 分析
我们的目的是查看蛋白绑定在什么样的DNA区段,以及它有什么样的功能
library(Biostrings)
library(seqLogo)
library(BCRANK)
args <- systemArgs(sysma = "param/annotate_peaks.param", mytargets = "targets_macs.txt")
rangefiles <- infile1(args)
for (i in seq(along = rangefiles)) {
df <- read.delim(rangefiles[i], comment = "#")
peaks <- as(df, "GRanges")
names(peaks) <- paste0(as.character(seqnames(peaks)), "_",
start(peaks), "-", end(peaks))
peaks <- peaks[order(values(peaks)$X.log10.pvalue., decreasing = TRUE)]
pseq <- getSeq(FaFile("./data/tair10.fasta"), peaks)
names(pseq) <- names(peaks)
writeXStringSet(pseq, paste0(rangefiles[i], ".fasta"))
}
#可视化
BCRANKout <- bcrank(paste0(rangefiles[1], ".fasta"), restarts = 25,
use.P1 = TRUE, use.P2 = TRUE)
toptable(BCRANKout)
topMotif <- toptable(BCRANKout, 1)
weightMatrix <- pwm(topMotif, normalize = FALSE)
weightMatrixNormalized <- pwm(topMotif, normalize = TRUE)
pdf("results/seqlogo.pdf")
seqLogo(weightMatrixNormalized)
dev.off()
新版的
当然,升级到R3.6以上,systemPipeR有一些新用法,原理是一样的
Before start, Make sure following packages are already installed:
#options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
#BiocManager::install("systemPipeRdata")
#BiocManager::install("systemPipeR")
#BiocManager::install("ChIPseeker")
#BiocManager::install("ChIPpeakAnno")
#BiocManager::install("seqLogo")
#BiocManager::install("BCRANK")
#install.packages("tidyr")
#install.packages("graphlayouts")
#install.packages("ggridges")
library(systemPipeR)
library(systemPipeRdata)
#####################################################
## 1. Generate workflow environment
#####################################################
setwd(choose.dir())
genWorkenvir(workflow = "chipseq")#在工作目录中自动创建了chipseq目录及子目录,并生成了实验数据
setwd("chipseq")
#####################################################
## 2. Read preprocessing
#####################################################
## 2.1 Experiment definition provided by targets file
## The targets file defines all FASTQ files and sample comparisons of ## the analysis workflow.
targetspath <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
targets <- read.delim(targetspath, comment.char = "#")
targets[1:4, -c(5, 6)]
## 2.2 Read quality filtering and trimming
## The following example shows how one can design a custom read preprocessing
## function using utilities provided by the ShortRead package, and then apply it with
## preprocessReads in batch mode to all FASTQ samples referenced in the
## corresponding SYSargs2 instance (trim object below). More detailed information
## on read preprocessing is provided in systemPipeR's main vignette.
## First, we construct SYSargs2 object from cwl and yml param and targets files.
dir_path <- system.file("extdata/cwl/preprocessReads/trim-se", package = "systemPipeR")
trim <- loadWF(targets = targetspath, wf_file = "trim-se.cwl", input_file = "trim-se.yml", dir_path = dir_path)
trim <- renderWF(trim, inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"))
output(trim)[1:2]
## Next, we execute the code for trimming all the raw data.
filterFct <- function(fq, cutoff = 20, Nexceptions = 0) {
qcount <- rowSums(as(quality(fq), "matrix") <= cutoff, na.rm = TRUE)
fq[qcount <= Nexceptions]
# Retains reads where Phred scores are >= cutoff with N
# exceptions
}
preprocessReads(args = trim, Fct = "filterFct(fq, cutoff=20, Nexceptions=0)",
batchsize = 1e+05)
writeTargetsout(x = trim, file = "targets_chip_trim.txt", step = 1,
new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
## 2.3 FASTQ quality report
## The following seeFastq and seeFastqPlot functions generate and plot a series of
## useful quality statistics for a set of FASTQ files including per cycle quality box
## plots, base proportions, base-level quality trends, relative k-mer diversity, length
## and occurrence distribution of reads, number of reads above quality cutoffs and mean
## quality distribution. The results are written to a PDF file named fastqReport.pdf.
## Parallelization of FASTQ quality report via scheduler (e.g. Slurm) across several
## compute nodes.
library(BiocParallel)
library(batchtools)
f <- function(x) {
targets <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
dir_path <- system.file("extdata/cwl/preprocessReads/trim-se",
package = "systemPipeR")
trim <- loadWorkflow(targets = targets, wf_file = "trim-se.cwl",
input_file = "trim-se.yml", dir_path = dir_path)
trim <- renderWF(trim, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
seeFastq(fastq = infile1(trim)[x], batchsize = 1e+05, klength = 8)
}
resources <- list(walltime = 120, ntasks = 1, ncpus = 4, memory = 1024)
fqlist <- lapply(seq(along = trim), f)
pdf("./results/fastqReport.pdf", height = 18, width = 4 * length(fqlist))
seeFastqPlot(unlist(fqlist, recursive = FALSE))
dev.off()
#####################################################
## 3 Alignments
#####################################################
3.1 Read mapping with Bowtie2
#The NGS reads of this project will be aligned with Bowtie2 against the reference
## genome sequence (Langmead and Salzberg 2012). The parameter settings of the
## aligner are defined in the bowtie2-index.cwl and bowtie2-index.yml files. In
## ChIP-Seq experiments it is usually more appropriate to eliminate reads mapping to
## multiple locations. To achieve this, users want to remove the argument setting
## -k 50 non-deterministic in the configuration files.
# Building the index:
dir_path <- system.file("extdata/cwl/bowtie2/bowtie2-idx", package = "systemPipeR")
idx <- loadWorkflow(targets = NULL, wf_file = "bowtie2-index.cwl",
input_file = "bowtie2-index.yml", dir_path = dir_path)
idx <- renderWF(idx)
idx
cmdlist(idx)
# Run in single machine
#runCommandline(idx, make_bam = FALSE) # Skip, This must run on Linux
targets <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
dir_path <- system.file("extdata/cwl/bowtie2/bowtie2-se", package = "systemPipeR")
args <- loadWF(targets = targets, wf_file = "bowtie2-mapping-se.cwl",
input_file = "bowtie2-mapping-se.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"))
args
cmdlist(args)[1:2]
run the alignments sequentially on a single system.
args <- runCommandline(args, force = F)
#Check whether all BAM files have been created and write out the new targets file.
writeTargetsout(x = args, file = "targets_bam.txt", step = 1, new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
file.exists(outpaths)
# 3.2 Read and alignment stats
read_statsDF <- alignStats(args = args)
write.table(read_statsDF, "results/alignStats.xls", row.names = FALSE,quote = FALSE, sep = "\t")
read.delim("results/alignStats.xls")
#####################################################
## 4 Peak calling with MACS2 (Skip this step)
#####################################################
# Merging BAM files of technical and/or biological replicates can improve the
# sensitivity of the peak calling by increasing the depth of read coverage. The
# mergeBamByFactor function merges BAM files based on grouping information
# specified by a factor, here the Factor column of the imported targets file. It also
# returns an updated SYSargs2 object containing the paths to the merged BAM files
# as well as to any unmerged files without replicates. This step can be skipped if
# merging of BAM files is not desired.
# 4.1 Merge BAM files of replicates prior to peak calling
dir_path <- system.file("extdata/cwl/mergeBamByFactor", package = "systemPipeR")
args <- loadWF(targets = "targets_bam.txt", wf_file = "merge-bam.cwl",
input_file = "merge-bam.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_BAM_PATH_", SampleName = "_SampleName_"))
args_merge <- mergeBamByFactor(args = args, overwrite = TRUE)
writeTargetsout(x = args_merge, file = "targets_mergeBamByFactor.txt",
step = 1, new_col = "FileName", new_col_output_index = 1,
overwrite = TRUE)
# 4.2 Peak calling without input/reference sample
# MACS2 can perform peak calling on ChIP-Seq data with and without input samples
# (Zhang et al. 2008). The following performs peak calling without input on all samples
# specified in the corresponding args object. Note, due to the small size of the
# sample data, MACS2 needs to be run here with the nomodel setting. For real data
# sets, users want to remove this parameter in the corresponding *.param file(s).
dir_path <- system.file("extdata/cwl/MACS2/MACS2-noinput/", package = "systemPipeR")
args <- loadWF(targets = "targets_mergeBamByFactor.txt", wf_file = "macs2.cwl",
input_file = "macs2.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_", SampleName = "_SampleName_"))
runCommandline(args, make_bam = FALSE, force = T)
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
file.exists(outpaths)
writeTargetsout(x = args, file = "targets_macs.txt", step = 1,
new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
# 4.3 Peak calling with input/reference sample
# To perform peak calling with input samples, they can be most conveniently specified
# in the SampleReference column of the initial targets file. The writeTargetsRef
# function uses this information to create a targets file intermediate for running
# MACS2 with the corresponding input samples.
writeTargetsRef(infile = "targets_mergeBamByFactor.txt", outfile = "targets_bam_ref.txt",
silent = FALSE, overwrite = TRUE)
dir_path <- system.file("extdata/cwl/MACS2/MACS2-input/", package = "systemPipeR")
args_input <- loadWF(targets = "targets_bam_ref.txt", wf_file = "macs2-input.cwl",
input_file = "macs2.yml", dir_path = dir_path)
args_input <- renderWF(args_input, inputvars = c(FileName1 = "_FASTQ_PATH1_",
FileName2 = "_FASTQ_PATH2_", SampleName = "_SampleName_"))
cmdlist(args_input)[1]
## Run
args_input <- runCommandline(args_input, make_bam = FALSE, force = T)
outpaths_input <- subsetWF(args_input, slot = "output", subset = 1,
index = 1)
file.exists(outpaths_input)
writeTargetsout(x = args_input, file = "targets_macs_input.txt",
step = 1, new_col = "FileName", new_col_output_index = 1,
overwrite = TRUE)
# The peak calling results from MACS2 are written for each sample to separate files in
# the results directory. They are named after the corresponding files with extensions
# used by MACS2.
# 4.4 Identify consensus peaks
# The following example shows how one can identify consensus preaks among two
# peak sets sharing either a minimum absolute overlap and/or minimum relative
# overlap using the subsetByOverlaps or olRanges functions, respectively. Note, the
# latter is a custom function imported below by sourcing it.
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1) ## escolher um dos outputs index
peak_M1A <- outpaths["M1A"]
peak_M1A <-as(read.delim(peak_M1A, comment = "#")[, 1:3], "GRanges")
peak_A1A <- outpaths["A1A"]
peak_A1A <- as(read.delim(peak_A1A, comment = "#")[, 1:3], "GRanges")
(myol1 <- subsetByOverlaps(peak_M1A, peak_A1A, minoverlap = 1))
# Returns any overlap
myol2 <- olRanges(query = peak_M1A, subject = peak_A1A, output = "gr")
# Returns any overlap with OL length information
myol2[values(myol2)["OLpercQ"][, 1] >= 50]
# Returns only query peaks with a minimum overlap of 50%
## 5. Annotate peaks with genomic context
#####################################################
## 5.1 Annotation with ChIPpeakAnno package
## The following annotates the identified peaks with genomic context information
## using the ChIPpeakAnno and ChIPseeker packages, respectively (Zhu et al. 2010;
## Yu, Wang, and He 2015).
library(ChIPpeakAnno)
library(GenomicFeatures)
dir_path <- system.file("extdata/cwl/annotate_peaks", package = "systemPipeR")
# 将targets_macs.txt复制到chipseq目录中
args <- loadWF(targets = "targets_macs.txt", wf_file = "annotate-peaks.cwl",
input_file = "annotate-peaks.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
txdb <- makeTxDbFromGFF(file = "data/tair10.gff", format = "gff",
dataSource = "TAIR", organism = "Arabidopsis thaliana")
ge <- genes(txdb, columns = c("tx_name", "gene_id", "tx_type"))
# 将Peaks中的文件复制到chipseq/results目录中
for (i in seq(along = args)) {
peaksGR <- as(read.delim(infile1(args)[i], comment = "#"),
"GRanges")
annotatedPeak <- annotatePeakInBatch(peaksGR, AnnotationData = genes(txdb))
df <- data.frame(as.data.frame(annotatedPeak), as.data.frame(values(ge[values(annotatedPeak)$feature,
])))
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
write.table(df, outpaths[i], quote = FALSE, row.names = FALSE,
sep = "\t")
}
writeTargetsout(x = args, file = "targets_peakanno.txt", step = 1,
new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
## The peak annotation results are written for each peak set to separate files in the
## results directory. They are named after the corresponding peak files with
## extensions specified in the annotate_peaks.param file, here *.peaks.annotated.xls.
## 5.2 Annotation with ChIPseeker package
## Same as in previous step but using the ChIPseeker package for annotating the peaks.
library(ChIPseeker)
for (i in seq(along = args)) {
peakAnno <- annotatePeak(infile1(args)[i], TxDb = txdb, verbose = FALSE)
df <- as.data.frame(peakAnno)
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
write.table(df, outpaths[i], quote = FALSE, row.names = FALSE,
sep = "\t")
}
writeTargetsout(x = args, file = "targets_peakanno.txt", step = 1,
new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
# Summary plots provided by the ChIPseeker package. Here applied only to one sample for demonstration purposes.
peak <- readPeakFile(infile1(args)[1])
covplot(peak, weightCol = "X.log10.pvalue.")
outpaths <- subsetWF(args, slot = "output", subset = 1, index = 1)
peakHeatmap(outpaths[1], TxDb = txdb, upstream = 1000, downstream = 1000,
color = "red")
plotAvgProf2(outpaths[1], TxDb = txdb, upstream = 1000, downstream = 1000,
xlab = "Genomic Region (5'->3')", ylab = "Read Count Frequency")
#####################################################
## 6 Count reads overlapping peaks
#####################################################
## The countRangeset function is a convenience wrapper to perform read counting
## iteratively over serveral range sets, here peak range sets. Internally, the read
## counting is performed with the summarizeOverlaps function from the
## GenomicAlignments package. The resulting count tables are directly saved to files,
## one for each peak set.
library(GenomicRanges)
dir_path <- system.file("extdata/cwl/count_rangesets", package = "systemPipeR")
args <- loadWF(targets = "targets_macs.txt", wf_file = "count_rangesets.cwl",
input_file = "count_rangesets.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
## Bam Files
targets <- system.file("extdata", "targets_chip.txt", package = "systemPipeR")
dir_path <- system.file("extdata/cwl/bowtie2/bowtie2-se", package = "systemPipeR")
args_bam <- loadWF(targets = targets, wf_file = "bowtie2-mapping-se.cwl",
input_file = "bowtie2-mapping-se.yml", dir_path = dir_path)
args_bam <- renderWF(args_bam, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
args_bam <- output_update(args_bam, dir = FALSE, replace = TRUE,
extension = c(".sam", ".bam"))
outpaths <- subsetWF(args_bam, slot = "output", subset = 1, index = 1)
# 将Bam中的文件复制到chipseq/results目录中
bfl <- BamFileList(outpaths, yieldSize = 50000, index = character())
countDFnames <- countRangeset(bfl, args, mode = "Union", ignore.strand = TRUE)
writeTargetsout(x = args, file = "targets_countDF.txt", step = 1,
new_col = "FileName", new_col_output_index = 1, overwrite = TRUE)
#####################################################
## 7 Differential binding analysis
#####################################################
## The runDiff function performs differential binding analysis in batch mode for
## several count tables using edgeR or DESeq2 (Robinson, McCarthy, and Smyth 2010;
## Love, Huber, and Anders 2014). Internally, it calls the functions run_edgeR and
## run_DESeq2. It also returns the filtering results and plots from the downstream
## filterDEGs function using the fold change and FDR cutoffs provided under the
## dbrfilter argument.
dir_path <- system.file("extdata/cwl/rundiff", package = "systemPipeR")
args_diff <- loadWF(targets = "targets_countDF.txt", wf_file = "rundiff.cwl",
input_file = "rundiff.yml", dir_path = dir_path)
args_diff <- renderWF(args_diff, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
cmp <- readComp(file = args_bam, format = "matrix")
dbrlist <- runDiff(args = args_diff, diffFct = run_edgeR, targets = targets.as.df(targets(args_bam)),
cmp = cmp[[1]], independent = TRUE, dbrfilter = c(Fold = 2,
FDR = 1))
writeTargetsout(x = args_diff, file = "targets_rundiff.txt",
step = 1, new_col = "FileName", new_col_output_index = 1,
overwrite = TRUE)
#####################################################
## 8 GO term enrichment analysis
#####################################################
## The following performs GO term enrichment analysis for each annotated peak set.
dir_path <- system.file("extdata/cwl/annotate_peaks", package = "systemPipeR")
# 将targets_bam_ref.txt复制到chipseq目录中
args <- loadWF(targets = "targets_bam_ref.txt", wf_file = "annotate-peaks.cwl",
input_file = "annotate-peaks.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName1 = "_FASTQ_PATH1_",
FileName2 = "_FASTQ_PATH2_", SampleName = "_SampleName_"))
args_anno <- loadWF(targets = "targets_macs.txt", wf_file = "annotate-peaks.cwl",
input_file = "annotate-peaks.yml", dir_path = dir_path)
args_anno <- renderWF(args_anno, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
annofiles <- subsetWF(args_anno, slot = "output", subset = 1,
index = 1)
gene_ids <- sapply(names(annofiles), function(x) unique(as.character(read.delim(annofiles[x])[,
"geneId"])), simplify = FALSE)
load("data/GO/catdb.RData")
BatchResult <- GOCluster_Report(catdb = catdb, setlist = gene_ids,
method = "all", id_type = "gene", CLSZ = 2, cutoff = 0.9,
gocats = c("MF", "BP", "CC"), recordSpecGO = NULL)
#####################################################
## 9 Motif analysis
#####################################################
## 9.1 Parse DNA sequences of peak regions from genome
## Enrichment analysis of known DNA binding motifs or de novo discovery of novel
## motifs requires the DNA sequences of the identified peak regions. To parse the
## corresponding sequences from the reference genome, the getSeq function from
## the Biostrings package can be used. The following example parses the sequences
## for each peak set and saves the results to separate FASTA files, one for each peak
## set. In addition, the sequences in the FASTA files are ranked (sorted) by increasing ## p-values as expected by some motif discovery tools, such as BCRANK.
library(Biostrings)
library(seqLogo)
library(BCRANK)
dir_path <- system.file("extdata/cwl/annotate_peaks", package = "systemPipeR")
args <- loadWF(targets = "targets_macs.txt", wf_file = "annotate-peaks.cwl",
input_file = "annotate-peaks.yml", dir_path = dir_path)
args <- renderWF(args, inputvars = c(FileName = "_FASTQ_PATH1_",
SampleName = "_SampleName_"))
rangefiles <- infile1(args)
for (i in seq(along = rangefiles)) {
df <- read.delim(rangefiles[i], comment = "#")
peaks <- as(df, "GRanges")
names(peaks) <- paste0(as.character(seqnames(peaks)), "_",
start(peaks), "-", end(peaks))
peaks <- peaks[order(values(peaks)$X.log10.pvalue., decreasing = TRUE)]
pseq <- getSeq(FaFile("./data/tair10.fasta"), peaks)
names(pseq) <- names(peaks)
writeXStringSet(pseq, paste0(rangefiles[i], ".fasta"))
}
## 9.2 Motif discovery with BCRANK
## The Bioconductor package BCRANK is one of the many tools available for de novo
## discovery of DNA binding motifs in peak regions of ChIP-Seq experiments. The given
## example applies this method on the first peak sample set and plots the sequence
## logo of the highest ranking motif.
set.seed(0)
BCRANKout <- bcrank(paste0(rangefiles[1], ".fasta"), restarts = 25,
use.P1 = TRUE, use.P2 = TRUE)
toptable(BCRANKout)
topMotif <- toptable(BCRANKout, 1)
weightMatrix <- pwm(topMotif, normalize = FALSE)
weightMatrixNormalized <- pwm(topMotif, normalize = TRUE)
pdf("results/seqlogo.pdf")
seqLogo(weightMatrixNormalized)
dev.off()