# 1. 初步使用
- 运行下面几行代码,初步了解ChIPpeakAnno使用
library(ChIPpeakAnno)
## import the MACS output
macs <- system.file("extdata", "MACS_peaks.xls", package="ChIPpeakAnno")
macsOutput <- toGRanges(macs, format="MACS")
## annotate the peaks with precompiled ensembl annotation
data(TSS.human.GRCh38)
macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=TSS.human.GRCh38)
## add gene symbols
library(org.Hs.eg.db)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno,
orgAnn="org.Hs.eg.db",
IDs2Add="symbol")
if(interactive()){## annotate the peaks with UCSC annotation
library(GenomicFeatures)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
macs.anno <- annotatePeakInBatch(macsOutput,
AnnotationData=ucsc.hg38.knownGene)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno,
orgAnn="org.Hs.eg.db",
feature_id_type="entrez_id",
IDs2Add="symbol")
}
# 2. 详细使用例子
ChIPpeakAnno使用GRanges 保存peak数据。ChIPpeakAnno内置toGRanges 函数可以将BED, GFF等格式转换为GRanges。
- toGRanges()函数
toGRanges(data, format=c("BED", "GFF", "GTF",
"MACS", "MACS2", "MACS2.broad",
"narrowPeak", "broadPeak",
"others"),
header=FALSE, comment.char="#", colNames=NULL, ...)
bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
gr1 <- toGRanges(bed, format="BED", header=FALSE)
## one can also try import from rtracklayer
library(rtracklayer)
gr1.import <- import(bed, format="BED")
identical(start(gr1), start(gr1.import))
# [1] TRUE
gr1[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | score
<Rle> <IRanges> <Rle> | <numeric>
MACS_peak_1 chr1 28341-29610 * | 160.81
MACS_peak_2 chr1 90821-91234 * | 133.12
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr1.import[1:2] #note the name slot is different from gr1
GRanges object with 2 ranges and 2 metadata columns:
seqnames ranges strand | name score
<Rle> <IRanges> <Rle> | <character> <numeric>
[1] chr1 28341-29610 * | MACS_peak_1 160.81
[2] chr1 90821-91234 * | MACS_peak_2 133.12
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno")
gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3)
ol <- findOverlapsOfPeaks(gr1, gr2)
- 韦恩图展示overlap结果
makeVennDiagram(ol,
fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2"))
image.png
- 饼图展示结果
pie1(table(ol$overlappingPeaks[["gr1///gr2"]]$overlapFeature))
image.png
- 找到overlap peak后,可以使用annotatePeakInBatch在maxgap使用AnnotationData中的基因组特征对peak进行注释,在以下示例maxgap=5kb。
overlaps <- ol$peaklist[["gr1///gr2"]]
## ============== old style ===========
## data(TSS.human.GRCh37)
## overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData,
## output="overlapping", maxgap=5000L)
## overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", "symbol")
## ============== new style ===========
library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
annoData[1:2]
GRanges object with 2 ranges and 1 metadata column:
seqnames ranges strand | gene_name
<Rle> <IRanges> <Rle> | <character>
ENSG00000223972 chr1 11869-14412 + | DDX11L1
ENSG00000227232 chr1 14363-29806 - | WASH7P
-------
seqinfo: 273 sequences from GRCh37 genome
- annotatePeakInBatch进行注释
获取距离最近的TSS、miRNA、外显子等的距离
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData,
output="overlapping", maxgap=5000L)
overlaps.anno$gene_name <-
annoData$gene_name[match(overlaps.anno$feature,
names(annoData))]
head(overlaps.anno)
GRanges object with 6 ranges and 11 metadata columns:
seqnames ranges strand |
<Rle> <IRanges> <Rle> |
X001.ENSG00000228327 chr1 713791-715578 * |
X001.ENSG00000237491 chr1 713791-715578 * |
X002.ENSG00000237491 chr1 724851-727191 * |
X003.ENSG00000272438 chr1 839467-840090 * |
X004.ENSG00000223764 chr1 856361-856999 * |
X004.ENSG00000187634 chr1 856361-856999 * |
peakNames
<CharacterList>
X001.ENSG00000228327 gr1__MACS_peak_13,gr2__001,gr2__002
X001.ENSG00000237491 gr1__MACS_peak_13,gr2__001,gr2__002
X002.ENSG00000237491 gr2__003,gr1__MACS_peak_14
X003.ENSG00000272438 gr1__MACS_peak_16,gr2__004
X004.ENSG00000223764 gr1__MACS_peak_17,gr2__005
X004.ENSG00000187634 gr1__MACS_peak_17,gr2__005
peak feature
<character> <character>
X001.ENSG00000228327 001 ENSG00000228327
X001.ENSG00000237491 001 ENSG00000237491
X002.ENSG00000237491 002 ENSG00000237491
X003.ENSG00000272438 003 ENSG00000272438
X004.ENSG00000223764 004 ENSG00000223764
X004.ENSG00000187634 004 ENSG00000187634
start_position end_position
<integer> <integer>
X001.ENSG00000228327 700237 714006
X001.ENSG00000237491 714150 745440
X002.ENSG00000237491 714150 745440
X003.ENSG00000272438 840214 851356
X004.ENSG00000223764 852245 856396
X004.ENSG00000187634 860260 879955
feature_strand insideFeature
<character> <character>
X001.ENSG00000228327 - overlapStart
X001.ENSG00000237491 + overlapStart
X002.ENSG00000237491 + inside
X003.ENSG00000272438 + upstream
X004.ENSG00000223764 - overlapStart
X004.ENSG00000187634 + upstream
distancetoFeature shortestDistance
<numeric> <integer>
X001.ENSG00000228327 215 215
X001.ENSG00000237491 -359 359
X002.ENSG00000237491 10701 10701
X003.ENSG00000272438 -747 124
X004.ENSG00000223764 35 35
X004.ENSG00000187634 -3899 3261
fromOverlappingOrNearest gene_name
<character> <character>
X001.ENSG00000228327 Overlapping RP11-206L10.2
X001.ENSG00000237491 Overlapping RP11-206L10.9
X002.ENSG00000237491 Overlapping RP11-206L10.9
X003.ENSG00000272438 Overlapping RP11-54O7.16
X004.ENSG00000223764 Overlapping RP11-54O7.3
X004.ENSG00000187634 Overlapping SAMD11
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
- 绘制peak与最近特征(如转录起始位点TSS)的距离分布。
gr1.copy <- gr1
gr1.copy$score <- 1
binOverFeature(gr1, gr1.copy, annotationData=annoData,
radius=5000, nbins=10, FUN=c(sum, length),
ylab=c("score", "count"),
main=c("Distribution of aggregated peak scores around TSS",
"Distribution of aggregated peak numbers around TSS"))
image.png
- assignChromosomeRegion()可以统计 exon, intron, enhancer, proximal promoter, 5’ UTR 和3’ UTR在peak中的分布。
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
aCR<-assignChromosomeRegion(gr1, nucleotideLevel=FALSE,
precedence=c("Promoters", "immediateDownstream",
"fiveUTRs", "threeUTRs",
"Exons", "Introns"),
TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
barplot(aCR$percentage)
}
image.png
# 3. ChIPpeakAnno 各种运用场景
- loading called peaks from BED, GFF, or other formats;
- evaluating and visualizing the concordance among the biological replicates;
- combining peaks from replicates; preparing genomic annotation(s) as GRanges;
- associating/annotating peaks with the annotation(s); summarizing peak distributions over exon, intron, enhancer, proximal promoter, 5’UTR and 3’UTR regions;
- retrieving the sequences around the peaks; and enrichment analysis of GO and biological pathway.
- plot the heatmap of given peak ranges
- perform permutation test to determine if there is a significant overlap between two sets of peaks.
## 3. 计算不同peak数据的overlap,并使用韦恩图展示
- 通过寻找重复实验peak的overlap来评估实验之间的peak的一致性。也可以通过这种不同实验之间peak一致性来判断两个实验是不是存在一定的联系。
peaks1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6",
"2", "6", "6", "6", "6", "5"),
ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869,
3123260, 3857501, 201089, 1543200,
1557200, 1563000, 1569800, 167889600),
end= c(967754, 2010997, 2496804, 3075969,
3123360, 3857601, 201089, 1555199,
1560599, 1565199, 1573799, 167893599),
names=paste("Site", 1:12, sep="")),
strand="+")
peaks2 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3",
"4", "5", "6", "6", "6", "6", "6", "5"),
ranges=IRanges(start=c(967659, 2010898, 2496700,
3075866, 3123260, 3857500,
96765, 201089, 249670, 307586,
312326, 385750, 1549800,
1554400, 1565000, 1569400,
167888600),
end=c(967869, 2011108, 2496920,
3076166,3123470, 3857780,
96985, 201299, 249890, 307796,
312586, 385960, 1550599, 1560799,
1565399, 1571199, 167888999),
names=paste("t", 1:17, sep="")),
strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-",
"-", "-", "-", "+", "+", "+", "+", "+"))
ol <- findOverlapsOfPeaks(peaks1, peaks2, maxgap=1000)
peaklist <- ol$peaklist
attributes(ol )
$names
[1] "venn_cnt" "peaklist" "uniquePeaks"
[4] "mergedPeaks" "peaksInMergedPeaks" "overlappingPeaks"
[7] "all.peaks"
$class
[1] "overlappingPeaks"
olpeaks1///peaks2
中存放peak1和peak2的overlap peak
overlappingPeaks <- ol$overlappingPeaks
overlappingPeaks[["peaks1///peaks2"]][1:2, ]
## peaks1 seqnames start end width strand peaks2 seqnames start
## Site1_t1 Site1 1 967654 967754 101 + t1 1 967659
## Site2_t2 Site2 2 2010897 2010997 101 + t2 2 2010898
## end width strand overlapFeature shortestDistance
## Site1_t1 967869 211 + overlapStart 5
## Site2_t2 2011108 211 + overlapStart 1
pie1(table(overlappingPeaks[["peaks1///peaks2"]]$overlapFeature))
image.png
- 获取merged overlapping peaks
peaklist[["peaks1///peaks2"]]
## GRanges object with 11 ranges and 1 metadata column:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## [1] 1 967654-967869 + |
## [2] 2 201089-201299 * |
## [3] 2 2010897-2011108 + |
## [4] 3 2496700-2496920 + |
## [5] 4 3075866-3076166 + |
## [6] 5 3123260-3123470 + |
## [7] 5 167888600-167893599 + |
## [8] 6 1543200-1560799 + |
## [9] 6 1563000-1565399 + |
## [10] 6 1569400-1573799 + |
## [11] 6 3857500-3857780 + |
## peakNames
## <CharacterList>
## [1] peaks1__Site1,peaks2__t1
## [2] peaks1__Site7,peaks2__t8
## [3] peaks1__Site2,peaks2__t2
## [4] peaks2__t3,peaks1__Site3
## [5] peaks2__t4,peaks1__Site4
## [6] peaks1__Site5,peaks2__t5
## [7] peaks2__t17,peaks1__Site12
## [8] peaks1__Site8,peaks2__t13,peaks2__t14,...
## [9] peaks1__Site10,peaks2__t15
## [10] peaks2__t16,peaks1__Site11
## [11] peaks2__t6,peaks1__Site6
## -------
## seqinfo: 6 sequences from an unspecified genome; no seqlengths
- peaks1 中宇peaks2没有overlap的peak
peaklist[["peaks1"]]
## NULL
- peaks2 中宇peaks1没有overlap的peak
peaklist[["peaks2"]]
## GRanges object with 5 ranges and 1 metadata column:
## seqnames ranges strand | peakNames
## <Rle> <IRanges> <Rle> | <CharacterList>
## [1] 1 96765-96985 - | peaks2__t7
## [2] 3 249670-249890 - | peaks2__t9
## [3] 4 307586-307796 - | peaks2__t10
## [4] 5 312326-312586 - | peaks2__t11
## [5] 6 385750-385960 - | peaks2__t12
## -------
## seqinfo: 6 sequences from an unspecified genome; no seqlengths
- 韦恩图
并且输出overlapping 是否显著的p值
makeVennDiagram(ol, totalTest=1e+2,
fill=c("#009E73", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2"), #circle border color
cat.col=c("#D55E00", "#0072B2"))
$p.value
peaks1 peaks2 pval
[1,] 1 1 5.890971e-12
$vennCounts
peaks1 peaks2 Counts count.peaks1 count.peaks2
[1,] 0 0 83 0 0
[2,] 0 1 5 0 5
[3,] 1 0 0 0 0
[4,] 1 1 12 12 12
attr(,"class")
[1] "VennCounts"
image.png
- 使用R 包Vernerable绘制韦恩图
# install.packages("Vennerable", repos="http://R-Forge.R-project.org",
# type="source")
# library(Vennerable)
# venn_cnt2venn <- function(venn_cnt){
# n <- which(colnames(venn_cnt)=="Counts") - 1
# SetNames=colnames(venn_cnt)[1:n]
# Weight=venn_cnt[,"Counts"]
# names(Weight) <- apply(venn_cnt[,1:n], 1, base::paste, collapse="")
# Venn(SetNames=SetNames, Weight=Weight)
# }
#
# v <- venn_cnt2venn(ol$venn_cnt)
# plot(v)
- findOverlapsOfPeaks 最多可以计算5 peak lists 的overlapping peak
peaks3 <- GRanges(seqnames=c("1", "2", "3", "4", "5",
"6", "1", "2", "3", "4"),
ranges=IRanges(start=c(967859, 2010868, 2496500, 3075966,
3123460, 3851500, 96865, 201189,
249600, 307386),
end= c(967969, 2011908, 2496720, 3076166,
3123470, 3857680, 96985, 201299,
249890, 307796),
names=paste("p", 1:10, sep="")),
strand=c("+", "+", "+", "+", "+",
"+", "-", "-", "-", "-"))
ol <- findOverlapsOfPeaks(peaks1, peaks2, peaks3, maxgap=1000,
connectedPeaks="min")
makeVennDiagram(ol, totalTest=1e+2,
fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color
col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color
cat.col=c("#D55E00", "#0072B2", "#E69F00"))
image.png
函数makeVennDiagram中的参数totalTest表示超几何检验中使用的潜在峰值总数。它应该大于最大peak数。设置得越小,测试越严格。用于计算p值的时间并不取决于totalTest的值。超几何检验要求用户输入给定TF的总潜在结合位点(峰值)的估计值。为了规避这一要求,ChIPpeakAnno 实现了一个名为permTest的置换检验。
## 生成注释数据
- peak一半需要注释到基因组元件,ChIPpeakAnno 内置有TSS.human.NCBI36, TSS.human.GRCh37, TSS.human.GRCh38, TSS.mouse.NCBIM37, TSS.mouse.GRCm38, TSS.rat.RGSC3.4, TSS.rat.Rnor_5.0, TSS.zebrafish.Zv8, and TSS.zebrafish.Zv9。使用data()即可调用,data(TSS.human.GRCh38)。对于其他的注释信息,可以使用getAnnotation()函数设置参数featureType,“Exon” to obtain the nearest exon, “miRNA” to find the nearest miRNA, and “5utr” or “3utr” to locate the overlapping “5’UTR” or “3’UTR”. 更多的参数与biomaRt类似。
类似使用TranscriptDb TxDb.Hsapiens.UCSC.hg19.knownGene 和toGRanges可以构建注释文件。
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene")
annoData
## GRanges object with 23056 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 1 chr19 58858172-58874214 -
## 10 chr8 18248755-18258723 +
## 100 chr20 43248163-43280376 -
## 1000 chr18 25530930-25757445 -
## 10000 chr1 243651535-244006886 -
## ... ... ... ...
## 9991 chr9 114979995-115095944 -
## 9992 chr21 35736323-35743440 +
## 9993 chr22 19023795-19109967 -
## 9994 chr6 90539619-90584155 +
## 9997 chr22 50961997-50964905 -
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
## 注释
data(myPeakList)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
AnnotationData = annoData)
annotatedPeak[1:3]
## GRanges object with 3 ranges and 9 metadata columns:
## seqnames ranges strand | peak
## <Rle> <IRanges> <Rle> | <character>
## X1_93_556427.100288069 chr1 556660-556760 * | X1_93_556427
## X1_41_559455.100288069 chr1 559774-559874 * | X1_41_559455
## X1_12_703729.100288069 chr1 703885-703985 * | X1_12_703729
## feature start_position end_position feature_strand
## <character> <integer> <integer> <character>
## X1_93_556427.100288069 100288069 700245 714068 -
## X1_41_559455.100288069 100288069 700245 714068 -
## X1_12_703729.100288069 100288069 700245 714068 -
## insideFeature distancetoFeature shortestDistance
## <character> <numeric> <integer>
## X1_93_556427.100288069 downstream 157408 143485
## X1_41_559455.100288069 downstream 154294 140371
## X1_12_703729.100288069 inside 10183 3640
## fromOverlappingOrNearest
## <character>
## X1_93_556427.100288069 NearestLocation
## X1_41_559455.100288069 NearestLocation
## X1_12_703729.100288069 NearestLocation
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
- 使用自定义的注释文件
myPeak1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6",
"2", "6", "6", "6", "6", "5"),
ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869,
3123260, 3857501, 201089, 1543200,
1557200, 1563000, 1569800, 167889600),
end= c(967754, 2010997, 2496804, 3075969,
3123360, 3857601, 201089, 1555199,
1560599, 1565199, 1573799, 167893599),
names=paste("Site", 1:12, sep="")))
TFbindingSites <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2",
"3", "4", "5", "6", "6", "6", "6", "6",
"5"),
ranges=IRanges(start=c(967659, 2010898, 2496700,
3075866, 3123260, 3857500,
96765, 201089, 249670, 307586,
312326, 385750, 1549800,
1554400, 1565000, 1569400,
167888600),
end=c(967869, 2011108, 2496920,
3076166,3123470, 3857780,
96985, 201299, 249890, 307796,
312586, 385960, 1550599, 1560799,
1565399, 1571199, 167888999),
names=paste("t", 1:17, sep="")),
strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-",
"-", "-", "-", "+", "+", "+", "+", "+"))
annotatedPeak2 <- annotatePeakInBatch(myPeak1, AnnotationData=TFbindingSites)
annotatedPeak2[1:3]
## GRanges object with 3 ranges and 9 metadata columns:
## seqnames ranges strand | peak feature
## <Rle> <IRanges> <Rle> | <character> <character>
## Site1.t1 1 967654-967754 * | Site1 t1
## Site2.t2 2 2010897-2010997 * | Site2 t2
## Site3.t3 3 2496704-2496804 * | Site3 t3
## start_position end_position feature_strand insideFeature
## <integer> <integer> <character> <character>
## Site1.t1 967659 967869 + overlapStart
## Site2.t2 2010898 2011108 + overlapStart
## Site3.t3 2496700 2496920 + inside
## distancetoFeature shortestDistance fromOverlappingOrNearest
## <numeric> <integer> <character>
## Site1.t1 -5 5 NearestLocation
## Site2.t2 -1 1 NearestLocation
## Site3.t3 4 4 NearestLocation
## -------
## seqinfo: 6 sequences from an unspecified genome; no seqlengths
- 饼图展示
pie1(table(as.data.frame(annotatedPeak2)$insideFeature))
image.png
- GenomicFeatures::promoters可以获取启动子注释
library(ChIPpeakAnno)
data(myPeakList)
data(TSS.human.NCBI36)
annotationData <- promoters(TSS.human.NCBI36, upstream=5000, downstream=500)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6,],
AnnotationData=annotationData,
output="overlapping")
annotatedPeak[1:3]
## GRanges object with 3 ranges and 9 metadata columns:
## seqnames ranges strand | peak
## <Rle> <IRanges> <Rle> | <character>
## X1_93_556427.ENSG00000209341 chr1 556660-556760 * | X1_93_556427
## X1_93_556427.ENSG00000209344 chr1 556660-556760 * | X1_93_556427
## X1_93_556427.ENSG00000209346 chr1 556660-556760 * | X1_93_556427
## feature start_position end_position
## <character> <integer> <integer>
## X1_93_556427.ENSG00000209341 ENSG00000209341 554314 559813
## X1_93_556427.ENSG00000209344 ENSG00000209344 555569 561068
## X1_93_556427.ENSG00000209346 ENSG00000209346 555643 561142
## feature_strand insideFeature distancetoFeature
## <character> <character> <numeric>
## X1_93_556427.ENSG00000209341 - inside 3153
## X1_93_556427.ENSG00000209344 - inside 4408
## X1_93_556427.ENSG00000209346 - inside 4482
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## X1_93_556427.ENSG00000209341 2346 Overlapping
## X1_93_556427.ENSG00000209344 1091 Overlapping
## X1_93_556427.ENSG00000209346 1017 Overlapping
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
- 除了将peak注释到最近的基因之外,ChIPpeakAnno还可以通过在注释PeakInBatch中设置output=“both”和maxgap来报告所有overlap和侧翼基因。例如,如果设置maxgap=5000和output=“both”,它输出5kb内所有重叠和侧翼基因,再加上最近的基因。
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
AnnotationData = annoData,
output="both", maxgap=5000)
head(annotatedPeak)
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand | peak
## <Rle> <IRanges> <Rle> | <character>
## X1_93_556427.100288069 chr1 556660-556760 * | X1_93_556427
## X1_41_559455.100288069 chr1 559774-559874 * | X1_41_559455
## X1_12_703729.100288069 chr1 703885-703985 * | X1_12_703729
## X1_20_925025.84808 chr1 926058-926158 * | X1_20_925025
## X1_11_1041174.54991 chr1 1041646-1041746 * | X1_11_1041174
## X1_14_1269014.1855 chr1 1270239-1270339 * | X1_14_1269014
## feature start_position end_position feature_strand
## <character> <integer> <integer> <character>
## X1_93_556427.100288069 100288069 700245 714068 -
## X1_41_559455.100288069 100288069 700245 714068 -
## X1_12_703729.100288069 100288069 700245 714068 -
## X1_20_925025.84808 84808 910579 917473 -
## X1_11_1041174.54991 54991 1017198 1051736 -
## X1_14_1269014.1855 1855 1270658 1284492 -
## insideFeature distancetoFeature shortestDistance
## <character> <numeric> <integer>
## X1_93_556427.100288069 downstream 157408 143485
## X1_41_559455.100288069 downstream 154294 140371
## X1_12_703729.100288069 inside 10183 3640
## X1_20_925025.84808 upstream -8585 8585
## X1_11_1041174.54991 inside 10090 9990
## X1_14_1269014.1855 downstream 14253 319
## fromOverlappingOrNearest
## <character>
## X1_93_556427.100288069 NearestLocation
## X1_41_559455.100288069 NearestLocation
## X1_12_703729.100288069 NearestLocation
## X1_20_925025.84808 NearestLocation
## X1_11_1041174.54991 NearestLocation
## X1_14_1269014.1855 Overlapping
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
## Add other feature IDs to the annotated peaks
data(annotatedPeak)
library(org.Hs.eg.db)
addGeneIDs(annotatedPeak[1:6], orgAnn="org.Hs.eg.db", IDs2Add=c("symbol"))
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand |
## <Rle> <IRanges> <Rle> |
## X1_11_100272487.ENSG00000202254 1 100272801-100272900 + |
## X1_11_108905539.ENSG00000186086 1 108906026-108906125 + |
## X1_11_110106925.ENSG00000065135 1 110107267-110107366 + |
## X1_11_110679983.ENSG00000197106 1 110680469-110680568 + |
## X1_11_110681677.ENSG00000197106 1 110682125-110682224 + |
## X1_11_110756560.ENSG00000116396 1 110756823-110756922 + |
## peak feature start_position
## <character> <character> <numeric>
## X1_11_100272487.ENSG00000202254 1_11_100272487 ENSG00000202254 100257218
## X1_11_108905539.ENSG00000186086 1_11_108905539 ENSG00000186086 108918435
## X1_11_110106925.ENSG00000065135 1_11_110106925 ENSG00000065135 110091233
## X1_11_110679983.ENSG00000197106 1_11_110679983 ENSG00000197106 110693108
## X1_11_110681677.ENSG00000197106 1_11_110681677 ENSG00000197106 110693108
## X1_11_110756560.ENSG00000116396 1_11_110756560 ENSG00000116396 110753965
## end_position insideFeature distancetoFeature
## <numeric> <character> <numeric>
## X1_11_100272487.ENSG00000202254 100257309 downstream 15582
## X1_11_108905539.ENSG00000186086 109013624 upstream -12410
## X1_11_110106925.ENSG00000065135 110136975 inside 16033
## X1_11_110679983.ENSG00000197106 110744824 upstream -12640
## X1_11_110681677.ENSG00000197106 110744824 upstream -10984
## X1_11_110756560.ENSG00000116396 110776666 inside 2857
## shortestDistance fromOverlappingOrNearest
## <numeric> <character>
## X1_11_100272487.ENSG00000202254 15491 NearestStart
## X1_11_108905539.ENSG00000186086 12310 NearestStart
## X1_11_110106925.ENSG00000065135 16033 NearestStart
## X1_11_110679983.ENSG00000197106 12540 NearestStart
## X1_11_110681677.ENSG00000197106 10884 NearestStart
## X1_11_110756560.ENSG00000116396 2857 NearestStart
## symbol
## <character>
## X1_11_100272487.ENSG00000202254 <NA>
## X1_11_108905539.ENSG00000186086 NBPF6
## X1_11_110106925.ENSG00000065135 GNAI3
## X1_11_110679983.ENSG00000197106 SLC6A17
## X1_11_110681677.ENSG00000197106 SLC6A17
## X1_11_110756560.ENSG00000116396 KCNC4
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
addGeneIDs(annotatedPeak$feature[1:6], orgAnn="org.Hs.eg.db",
IDs2Add=c("symbol"))
## ensembl_gene_id symbol
## 1 ENSG00000065135 GNAI3
## 2 ENSG00000116396 KCNC4
## 3 ENSG00000197106 SLC6A17
## 4 ENSG00000186086 NBPF6
## 5 ENSG00000202254 <NA>
## 获取peak附近的序列
peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
ranges=IRanges(start=c(100, 500),
end=c(300, 600),
names=c("peak1", "peak2")))
library(BSgenome.Ecoli.NCBI.20080805)
peaksWithSequences <- getAllPeakSequence(peaks, upstream=20,
downstream=20, genome=Ecoli)
保存为fa格式
write2FASTA(peaksWithSequences,"test.fa")
## Create heatmap for given feature/peak ranges
path <- system.file("extdata", package="ChIPpeakAnno")
files <- dir(path, "broadPeak")
data <- sapply(file.path(path, files), toGRanges, format="broadPeak")
names(data) <- gsub(".broadPeak", "", files)
ol <- findOverlapsOfPeaks(data)
#makeVennDiagram(ol)
features <- ol$peaklist[[length(ol$peaklist)]]
wid <- width(features)
feature.recentered <- feature.center <- features
start(feature.center) <- start(features) + floor(wid/2)
width(feature.center) <- 1
start(feature.recentered) <- start(feature.center) - 2000
end(feature.recentered) <- end(feature.center) + 2000
## here we also suggest importData function in bioconductor trackViewer package
## to import the coverage.
## compare rtracklayer, it will save you time when handle huge dataset.
library(rtracklayer)
files <- dir(path, "bigWig")
if(.Platform$OS.type != "windows"){
cvglists <- sapply(file.path(path, files), import,
format="BigWig",
which=feature.recentered,
as="RleList")
}else{## rtracklayer can not import bigWig files on Windows
load(file.path(path, "cvglist.rds"))
}
names(cvglists) <- gsub(".bigWig", "", files)
sig <- featureAlignedSignal(cvglists, feature.center,
upstream=2000, downstream=2000)
heatmap <- featureAlignedHeatmap(sig, feature.center,
upstream=2000, downstream=2000,
upper.extreme=c(3,.5,4))
image.png
featureAlignedDistribution(sig, feature.center,
upstream=2000, downstream=2000,
type="l")
image.png
## peak中motif 研究
peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
ranges=IRanges(start=c(100, 500),
end=c(300, 600),
names=c("peak1", "peak2")))
filepath <- system.file("extdata", "examplePattern.fa", package="ChIPpeakAnno")
readLines(filepath)
## [1] ">ExamplePattern" "GGNCCK" ">ExamplePattern" "AACCNM"
library(BSgenome.Ecoli.NCBI.20080805)
summarizePatternInPeaks(patternFilePath=filepath, format="fasta", skip=0L,
BSgenomeName=Ecoli, peaks=peaks)
## peak注释的基因进行GO富集分析
library(org.Hs.eg.db)
over <- getEnrichedGO(annotatedPeak[1:500], orgAnn="org.Hs.eg.db",
maxP=0.01, minGOterm=10,
multiAdjMethod="BH",
condense=FALSE)
head(over[["bp"]][, -3])
## [1] go.id go.term Ontology
## [4] pvalue count.InDataset count.InGenome
## [7] totaltermInDataset totaltermInGenome BH.adjusted.p.value
## [10] EntrezID
## <0 rows> (or 0-length row.names)
head(over[["cc"]][, -3])
## [1] go.id go.term Ontology
## [4] pvalue count.InDataset count.InGenome
## [7] totaltermInDataset totaltermInGenome BH.adjusted.p.value
## [10] EntrezID
## <0 rows> (or 0-length row.names)
head(over[["mf"]][, -3])
egOrgMap("Mus musculus")
## [1] "org.Mm.eg.db"
egOrgMap("Homo sapiens")
## [1] "org.Hs.eg.db"
## Find peaks with bi-directional promoters
- 双向启动子是位于两个相邻基因的转录起始位点(TSS)之间的DNA区域,它们在相反方向上转录,通常由该共享启动子区域共同调控。下面是一个示例,用于查找双向启动子附近的peak,并输出双向启动子周围peak的百分比。
data(myPeakList)
data(TSS.human.NCBI36)
seqlevelsStyle(TSS.human.NCBI36) <- seqlevelsStyle(myPeakList)
annotatedBDP <- peaksNearBDP(myPeakList[1:10,],
AnnotationData=TSS.human.NCBI36,
MaxDistance=5000)
annotatedBDP$peaksWithBDP
## GRangesList object of length 2:
## $`1`
## GRanges object with 2 ranges and 9 metadata columns:
## seqnames ranges strand | bdp_idx peak
## <Rle> <IRanges> <Rle> | <integer> <character>
## X1_93_556427 chr1 556660-556760 * | 1 X1_93_556427
## X1_93_556427 chr1 556660-556760 * | 1 X1_93_556427
## feature feature.ranges feature.strand distance
## <character> <IRanges> <Rle> <integer>
## X1_93_556427 ENSG00000209349 556240-556304 - 355
## X1_93_556427 ENSG00000209351 557933-557999 + 1172
## insideFeature distanceToSite description
## <character> <integer> <character>
## X1_93_556427 upstream 355
## X1_93_556427 upstream 1172
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
##
## $`8`
## GRanges object with 2 ranges and 9 metadata columns:
## seqnames ranges strand | bdp_idx peak
## <Rle> <IRanges> <Rle> | <integer> <character>
## X1_14_1300250 chr1 1300503-1300603 * | 8 X1_14_1300250
## X1_14_1300250 chr1 1300503-1300603 * | 8 X1_14_1300250
## feature feature.ranges feature.strand distance
## <character> <IRanges> <Rle> <integer>
## X1_14_1300250 ENSG00000175756 1298974-1300443 - 59
## X1_14_1300250 ENSG00000218550 1303908-1304275 + 3304
## insideFeature distanceToSite description
## <character> <integer> <character>
## X1_14_1300250 upstream 59 Aurora kinase A-inte..
## X1_14_1300250 upstream 3304
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
c(annotatedBDP$percentPeaksWithBDP,
annotatedBDP$n.peaks,
annotatedBDP$n.peaksWithBDP)
## [1] 0.2 10.0 2.0
## 置换检验置换测试计算两组peak之间的overlap的显著性
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
cds <- unique(unlist(cdsBy(txdb)))
utr5 <- unique(unlist(fiveUTRsByTranscript(txdb)))
utr3 <- unique(unlist(threeUTRsByTranscript(txdb)))
set.seed(123)
utr3 <- utr3[sample.int(length(utr3), 1000)]
pt <- peakPermTest(utr3,
utr5[sample.int(length(utr5), 1000)],
maxgap=500,
TxDb=txdb, seed=1,
force.parallel=FALSE)
plot(pt)
## highly relevant peaks
ol <- findOverlaps(cds, utr3, maxgap=1)
pt1 <- peakPermTest(utr3,
c(cds[sample.int(length(cds), 500)],
cds[queryHits(ol)][sample.int(length(ol), 500)]),
maxgap=500,
TxDb=txdb, seed=1,
force.parallel=FALSE)
plot(pt1)
}
if(interactive()){
data(HOT.spots)
data(wgEncodeTfbsV3)
hotGR <- reduce(unlist(HOT.spots))
removeOl <- function(.ele){
ol <- findOverlaps(.ele, hotGR)
if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))]
.ele
}
temp <- tempfile()
download.file(file.path("http://hgdownload.cse.ucsc.edu",
"goldenPath", "hg19", "encodeDCC",
"wgEncodeRegTfbsClustered",
"wgEncodeRegTfbsClusteredV3.bed.gz"), temp)
data <- toGRanges(gzfile(temp, "r"), header=FALSE, format="others",
colNames = c("seqnames", "start", "end", "TF"))
unlink(temp)
data <- split(data, data$TF)
TAF1 <- removeOl(data[["TAF1"]])
TEAD4 <- removeOl(data[["TEAD4"]])
pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3), N=length(TAF1))
pt <- peakPermTest(TAF1, TEAD4, pool=pool, ntimes=1000)
plot(pt)
}