from Jimmy
读入peaks
peak <- readPeakFile("RNAPII_8WG16_peaks.narrowPeak") ##一定要是narrowPeak文件
## loading packages
library(ChIPseeker)
library(clusterProfiler)
there are 26029 peaks for this data
peaks性质
ChIP Peaks over Chromosomes
首先查看这些peaks在各个染色体上的分布,全局浏览
#peaks在所有染色体上的分布
covplot(peak,weightCol="V5")
#只看peaks在chr1,chr2上的分布
covplot(peak, chr = c("chr1", "chr2"))
peaks在所有基因的启动子附近的分布情况
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
#定义tss上下游的距离
tagMatrix <- getTagMatrix(peak, windows=promoter)
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="red")
热图每一行代表一个基因,展示的是所有基因TSS两侧的分布,除了热图外,还可以对所有基因取均值,用折线图来展示TSS两侧分布情况,用法如下
The Average Profile of ChIP peaks binding to TSS region
查看peaks在所有基因的启动子附近的分布情况,信号强度曲线图
plotAvgProf(tagMatrix, xlim=c(-3000, 3000),
xlab="Genomic Region (5' -> 3')", ylab = "Read Count Frequency")
peaks的注释
peakAnno <- annotatePeak(peak, tssRegion=c(-3000,3000),
TxDb=txdb, annoDb="org.Mm.eg.db")
peakAnno_df <- as.data.frame(peakAnno)
sampleName=strsplit(bedPeaksFile, '\\.')[1][1]
print(sampleName)
[1]"RNAPII_peaks"
write.csv(peakAnno_df, paste0(sampleName,'_peakAnno_df.csv'))
DT::datatable(peakAnno_df,
extensions='FixedColumns',
options =list(#dom='t',
scrollX =TRUE, fixedColumns = TRUE))
可以对peaks的性质做一些可视化,如下:
#png ('Pie-summarize the distribution of peaks over different type of features.png')
plotAnnoPie(peakAnno)
#png('Bar-summarize the distribution of peaks over different type of features.png')
plotAnnoBar(peakAnno)
#png('vennpie-summarize the distribution of peaks over different type of features.png')
#vennpie(peakAnno)
还可以查看peaks的长度分布,只统计长度在1000bp以下的peaks
peaksLength=abs(peakAnno_df$end-peakAnno_df$start)
peaksLength=peaksLength[peaksLength<1000]
hist(peaksLength, breaks =50, col = "lightblue", xlim=c(0,1000), xlab = "peak length", main="Histogram of peak length")
peaks相关基因的注释
peak的注释用annotatePeak,TSS (transcription start site) region 可以自己设定,默认是(-3000,3000),TxDb 是指某个物种的基因组,
可以把peaks先分类再注释,也可以直接拿所有peaks相关基因去富集分析,如果要分类,可以根据:
Promoter
5'UTR
3'UTR
Exon
Intron
Downstream
Intergenic
但是如果peaks本来就不多,那么分类之后基因太少,注释可能并没有意义。
genes=unique(peakAnno_df$ENSEMBL) #去除重复基因