下载CpG 岛的文件fromUCSC,不过坐标系统需要处理一下:
解释:
UCSC (0-based, 左闭右开):
区间写作 chrN:start-end,实际覆盖的碱基范围是 [start, end)。
坐标从 0 开始计数,最后一个碱基位置是 end - 1。
Ensembl (1-based, 闭区间):
区间写作 N:start-end,实际覆盖的碱基范围是 [start, end]。
坐标从 1 开始计数,最后一个碱基位置是 end
zcat cpgIsland_UCSCExt.txt.gz | \
awk -F'\t' '
BEGIN {OFS = FS} # 保持输入输出分隔符一致(制表符)
{
# 调整起始位置:0-based → 1-based
$3 = $3 + 1;
# 删除染色体名称的 "chr" 前缀(如 "chr1" → "1")
gsub(/^chr/, "", $2);
# 输出所有列(保留完整信息)
print
}' | \
gzip > cpgIslandExt_ensembl.txt.gz
methylKit Tutorial link:
https://bioconductor.org/packages/release/bioc/vignettes/methylKit/inst/doc/methylKit.html#4_Annotating_differentially_methylated_bases_or_regions
一些概念:
image.png
My R script:
rm(list=ls())
setwd("~/workspace/WGBS/3.Mapping/extractor/")
library("methylKit")
file.list=list( system.file("extdata",
"test1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"test2.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control2.myCpG.txt", package = "methylKit") )
# read the files to a methylRawList object: myobj
myobj=methRead(file.list,
sample.id=list("test1","test2","ctrl1","ctrl2"),
assembly="hg18",
treatment=c(1,1,0,0),
context="CpG",
mincov = 10
)
####for **big data** using DB to process
{myobjDB=methRead(file.list,
sample.id=list("test1","test2","ctrl1","ctrl2"),
assembly="hg18",
treatment=c(1,1,0,0),
context="CpG",
dbtype = "tabix",
dbdir = "methylDB"
)
print(myobjDB[[1]]@dbpath)
}
# -------------------------------------------------------------------------
## note: coverage mean methylated C + unmethylated C
getMethylationStats(myobj[[2]],plot=FALSE,both.strands=FALSE)
getMethylationStats(myobj[[2]],plot=TRUE,both.strands=FALSE)
getCoverageStats(myobj[[2]],plot=TRUE,both.strands=FALSE)
# filter ------------------------------------------------------------------
# keep coverage between 10 and 99.9%(percentile)
filtered.myobj=filterByCoverage(myobj,lo.count=10,lo.perc=NULL,
hi.count=NULL,hi.perc=99.9)
getCoverageStats(filtered.myobj[[2]],plot=TRUE,both.strands=FALSE)
# Comparative analysis-------------------------------------------------------------------------
meth=unite(myobj, destrand=FALSE)
# collect all samples together
head(meth)
# Correlation -------------------------------------------------------------
getCorrelation(meth,plot=TRUE)
clusterSamples(meth, dist="correlation", method="ward.D2", plot=TRUE)
hc = clusterSamples(meth, dist="correlation", method="ward.D2", plot=FALSE)
PCASamples(meth, screeplot=TRUE)
PCASamples(meth)
# -------------------------------------------------------------------------
####我认为不是很常用
sampleAnnotation = data.frame(batch_id = c("a","a","b","b"),
age = c(19,34,23,40))
as = assocComp(mBase = meth,sampleAnnotation)
#移除第一个主成分:
newObj=removeComp(meth,comp=1)
# -------------------------------------------------------------------------
myDiff=calculateDiffMeth(meth)
# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
head(myDiff25p)
# -------------------------------------------------------------------------
#Differentially Methylated Positions (DMPs)
myDiffall=getMethylDiff(myDiff,difference=1,qvalue=1)
df=myDiffall
df$type=ifelse(df$pvalue>0.05,'none',
ifelse( df$meth.diff >25,'hyperDMP',
ifelse( df$meth.diff< -25,'hypoDMP','none') )
)
head(df)
table(df$type)
# -------------------------------------------------------------------------
this_tile <- paste0('Cutoff for meth.diff is ',25,
'\nThe number of hyperDMP is ',nrow(df[df$type == 'hyperDMP',]) ,
'\nThe number of hypoDMP is ',nrow(df[df$type == 'hypoDMP',])
)
#火山图
library(ggplot2)
p5 <- ggplot(data = df,
aes(x = meth.diff,
y = -log10(pvalue))) +
geom_point(alpha=0.6, size=1.5,
aes(color=type)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("#34bfb5", "#828586","#ff6633"))+
geom_vline(xintercept= c(-25,25),lty=4,col="grey",lwd=0.8) +
geom_hline(yintercept = -log10(0.05),lty=4,col="grey",lwd=0.8)+
#xlim(-0.5, 0.5)+
theme_classic()+
ggtitle(this_tile )+
theme(plot.title = element_text(size=12,hjust = 0.5),
legend.title = element_blank()
)
p5
#*****Tutorial
# Analysis Differential Methylated Regions-------------------------------------------------------------------------
#Tiling windows analysis
myobj_lowCov = methRead(file.list,
sample.id=list("test1","test2","ctrl1","ctrl2"),
assembly="hg18",
treatment=c(1,1,0,0),
context="CpG",
mincov = 3
)
tiles = tileMethylCounts(myobj_lowCov,win.size=1000,step.size=1000,cov.bases = 10)
head(tiles[[1]],5) # test1 中的DMR信息
# find DMP or DMR ---------------------------------------------------------
myDiff=calculateDiffMeth(meth) # mc.cores=2 有多核选项
# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
bedgraph(myDiff25p,file.name="./diff.bedgraph",col.name="meth.diff")
diffMethPerChr(myDiff,plot=FALSE,qvalue.cutoff=0.01, meth.cutoff=25)
diffMethPerChr(myDiff,plot=T,qvalue.cutoff=0.01, meth.cutoff=25)
# Annotation --------------------------------------------------------------
.libPaths(c(.libPaths(), "/home/u24211510018/.conda/envs/genomation/lib/R/library"))
library(genomation)
# read the gene BED file
gene.obj=readTranscriptFeatures(system.file("extdata", "refseq.hg18.bed.txt",
package = "methylKit"))
annotateWithGeneParts(as(myDiff25p,"GRanges"),gene.obj)
### 造成这样现象的解释是:有外显子必有启动子,有启动子没有外显子
# percentage of target features overlapping with annotation:
# promoter exon intron intergenic
# 27.82 15.04 34.59 57.14
#
# percentage of target features overlapping with annotation:
# (with promoter > exon > intron precedence):
# promoter exon intron intergenic
# 27.82 0.00 15.04 57.14
# 对于基因的每个部分(如启动子、外显子、内含子),这些区域的边界与目标特征重叠的比例
# percentage of annotation boundaries with feature overlap:
# promoter exon intron
# 0.29 0.03 0.17
# read the shores and flanking regions and name the flanks as shores
# and CpG islands as CpGi
cpg.obj=readFeatureFlank(system.file("extdata", "cpgi.hg18.bed.txt",
package = "methylKit"),
feature.flank.name=c("CpGi","shores"))
#
# convert methylDiff object to GRanges and annotate
diffCpGann=annotateWithFeatureFlank(as(myDiff25p,"GRanges"),
cpg.obj$CpGi,cpg.obj$shores,
feature.name="CpGi",flank.name="shores")
# Regional analysis-------------------------------------------------------------------------
## 计算在promoter上,甲基化的水平
promoters=regionCounts(myobj,gene.obj$promoters)
head(promoters[[1]])
# -------------------------------------------------------------------------
diffAnn=annotateWithGeneParts(as(myDiff25p,"GRanges"),gene.obj)
# target.row is the row number in myDiff25p
head(getAssociationWithTSS(diffAnn))
getTargetAnnotationStats(diffAnn,percentage=TRUE,precedence=TRUE)
# -------------------------------------------------------------------------
plotTargetAnnotation(diffAnn,precedence=T,
main="differential methylation annotation")
plotTargetAnnotation(diffCpGann,col=c("green","gray","white"),
main="differential methylation annotation")
# 返回的是与promoter/exon/intron边界重叠的情况
getFeatsWithTargetsStats(diffAnn,percentage=TRUE)
# selectByOverlap-------------------------------------------------------------------------
library(GenomicRanges)
my.win=GRanges(seqnames="chr21",
ranges=IRanges(start=seq(from=9764513,by=10000,length.out=20),width=5000) )
# selects the records that lie inside the regions
selectByOverlap(myobj[[1]],my.win)
# Getting percent methylation matrix from methylBase objects --------------
# creates a matrix containing percent methylation values
perc.meth=percMethylation(meth)
head(perc.meth)
#######################################
# 提取甲基化程度矩阵
perc.meth = percMethylation(meth)
# 提取甲基化位点的染色体和位置信息
meth.pos = getData(meth)[, 1:3] # 提取 chr, start, end
colnames(meth.pos) = c("chr", "start", "end")
# 将甲基化程度和位置信息合并
meth.data = cbind(meth.pos, perc.meth)
head(meth.data)
library(ggplot2)
library(reshape2)
# 将数据转换为长格式(便于ggplot2绘图)
meth.data.long = melt(meth.data, id.vars = c("chr", "start", "end"),
variable.name = "sample", value.name = "methylation")
# 绘制单个染色体的甲基化程度图(以 chr1 为例)
chr1.data = subset(meth.data.long, chr == "chr21")
ggplot(chr1.data, aes(x = start, y = methylation, color = sample)) +
geom_point(alpha = 0.6, size = 0.5) + # 绘制散点图
labs(x = "Chromosome Position (bp)", y = "Methylation Level (%)",
title = "Methylation Level along Chromosome 21") +
theme_minimal() +
theme(legend.position = "bottom")
# -------------------------------------------------------------------------
# 提取目标基因的所有外显子
target.exons = gene.obj$exons[gene.obj$exons$name == "NM_199260", ]
target.exons
# 提取目标基因的起始和终止位置
gene.start = min(start(target.exons)) # 所有外显子的最小起始位置
gene.end = max(end(target.exons)) # 所有外显子的最大终止位置
# 打印目标基因的范围
cat("Target gene NM_007186 spans from", gene.start, "to", gene.end, "on chromosome 21.\n")
# 提取目标基因的甲基化数据
target.meth = getData(meth)[getData(meth)$chr == "chr21" &
getData(meth)$start >= gene.start &
getData(meth)$end <= gene.end, ]
# 提取甲基化程度
perc.meth = percMethylation(meth)
perc.meth = perc.meth[c(as.numeric(rownames(target.meth))), ]
# 合并位置和甲基化程度信息
target.data = cbind(target.meth[, 1:3], perc.meth)
head(target.data)
library(reshape2)
# 将数据转换为长格式
target.data.long = melt(target.data, id.vars = c("chr", "start", "end"),
variable.name = "sample", value.name = "methylation")
head(target.data.long)
library(ggplot2)
# 绘制目标基因的甲基化程度图
ggplot(target.data.long, aes(x = start, y = methylation, color = sample)) +
geom_point(alpha = 0.6, size = 2) + # 绘制散点图
labs(x = "Gene Position (bp)", y = "Methylation Level (%)",
title = "Methylation Level across NM_007186 Gene") +
theme_minimal() +
theme(legend.position = "bottom")
library(gggenes)
gene.structure = data.frame(
start = start(target.exons),
end = end(target.exons),
type = "exon"
)
# 绘制基因结构图
gene.plot = ggplot(gene.structure, aes(xmin = start, xmax = end, y = 1)) +
geom_gene_arrow(fill = "lightblue") +
labs(x = "Gene Position (bp)", y = "", title = "NM_007186 Gene Structure") +
theme_minimal() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank())
print(gene.plot)
library(patchwork)
# 甲基化程度图
meth.plot = ggplot(target.data.long, aes(x = start, y = methylation, color = sample)) +
geom_point(alpha = 0.6, size = 2) +
labs(x = "Gene Position (bp)", y = "Methylation Level (%)") +
theme_minimal() +
theme(legend.position = "bottom")
# 结合两个图
combined.plot = meth.plot / gene.plot +
plot_layout(heights = c(3, 1)) # 调整上下图的高度比例
print(combined.plot)
出图:
image.png
image.png
image.png
image.png
image.png
image.png
image.png