下载CpG 岛的文件fromUCSC，不过坐标系统需要处理一下：

解释：
UCSC (0-based, 左闭右开)：

区间写作 chrN:start-end，实际覆盖的碱基范围是 [start, end)。

坐标从 0 开始计数，最后一个碱基位置是 end - 1。

Ensembl (1-based, 闭区间)：

区间写作 N:start-end，实际覆盖的碱基范围是 [start, end]。

坐标从 1 开始计数，最后一个碱基位置是 end



zcat cpgIsland_UCSCExt.txt.gz | \
awk -F'\t' '
BEGIN {OFS = FS}  # 保持输入输出分隔符一致（制表符）
{
    # 调整起始位置：0-based → 1-based
    $3 = $3 + 1;
    # 删除染色体名称的 "chr" 前缀（如 "chr1" → "1"）
    gsub(/^chr/, "", $2);
    # 输出所有列（保留完整信息）
    print
}' | \
gzip > cpgIslandExt_ensembl.txt.gz

methylKit Tutorial link:
https://bioconductor.org/packages/release/bioc/vignettes/methylKit/inst/doc/methylKit.html#4_Annotating_differentially_methylated_bases_or_regions

一些概念：

image.png

My R script:

rm(list=ls())
setwd("~/workspace/WGBS/3.Mapping/extractor/")
library("methylKit")

file.list=list( system.file("extdata", 
                           "test1.myCpG.txt", package = "methylKit"),
               system.file("extdata",
                           "test2.myCpG.txt", package = "methylKit"),
               system.file("extdata", 
                           "control1.myCpG.txt", package = "methylKit"),
               system.file("extdata", 
                           "control2.myCpG.txt", package = "methylKit") )


# read the files to a methylRawList object: myobj
myobj=methRead(file.list,
              sample.id=list("test1","test2","ctrl1","ctrl2"),
              assembly="hg18",
              treatment=c(1,1,0,0),
              context="CpG",
              mincov = 10
)


####for **big data** using DB to process
{myobjDB=methRead(file.list,
                  sample.id=list("test1","test2","ctrl1","ctrl2"),
                  assembly="hg18",
                  treatment=c(1,1,0,0),
                  context="CpG",
                  dbtype = "tabix",
                  dbdir = "methylDB"
 )
 
 print(myobjDB[[1]]@dbpath)
}


# -------------------------------------------------------------------------
## note: coverage mean methylated C + unmethylated C 

getMethylationStats(myobj[[2]],plot=FALSE,both.strands=FALSE)

getMethylationStats(myobj[[2]],plot=TRUE,both.strands=FALSE)

getCoverageStats(myobj[[2]],plot=TRUE,both.strands=FALSE)


# filter ------------------------------------------------------------------

# keep coverage between 10 and 99.9%(percentile)
filtered.myobj=filterByCoverage(myobj,lo.count=10,lo.perc=NULL,
                               hi.count=NULL,hi.perc=99.9)
getCoverageStats(filtered.myobj[[2]],plot=TRUE,both.strands=FALSE)



# Comparative analysis-------------------------------------------------------------------------
meth=unite(myobj, destrand=FALSE)

# collect all samples together
head(meth)


# Correlation -------------------------------------------------------------


getCorrelation(meth,plot=TRUE)

clusterSamples(meth, dist="correlation", method="ward.D2", plot=TRUE)

hc = clusterSamples(meth, dist="correlation", method="ward.D2", plot=FALSE)

PCASamples(meth, screeplot=TRUE)

PCASamples(meth)


# -------------------------------------------------------------------------
####我认为不是很常用
sampleAnnotation = data.frame(batch_id = c("a","a","b","b"),
                             age = c(19,34,23,40))
as = assocComp(mBase = meth,sampleAnnotation)
#移除第一个主成分：
newObj=removeComp(meth,comp=1)


# -------------------------------------------------------------------------


myDiff=calculateDiffMeth(meth)

# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
head(myDiff25p)


# -------------------------------------------------------------------------

#Differentially Methylated Positions (DMPs)
myDiffall=getMethylDiff(myDiff,difference=1,qvalue=1)
df=myDiffall
df$type=ifelse(df$pvalue>0.05,'none',
              ifelse( df$meth.diff >25,'hyperDMP', 
                      ifelse( df$meth.diff< -25,'hypoDMP','none') )
)
head(df)

table(df$type)


# -------------------------------------------------------------------------


this_tile <- paste0('Cutoff for meth.diff is ',25,
                   '\nThe number of hyperDMP is ',nrow(df[df$type == 'hyperDMP',]) ,
                   '\nThe number of hypoDMP is ',nrow(df[df$type == 'hypoDMP',])
)
#火山图
library(ggplot2)
p5 <- ggplot(data = df, 
            aes(x = meth.diff, 
                y = -log10(pvalue))) +
 geom_point(alpha=0.6, size=1.5, 
            aes(color=type)) +
 ylab("-log10(Pvalue)")+
 scale_color_manual(values=c("#34bfb5", "#828586","#ff6633"))+
 geom_vline(xintercept= c(-25,25),lty=4,col="grey",lwd=0.8) +
 geom_hline(yintercept = -log10(0.05),lty=4,col="grey",lwd=0.8)+
 #xlim(-0.5, 0.5)+
 theme_classic()+
 ggtitle(this_tile )+
 theme(plot.title = element_text(size=12,hjust = 0.5),
       legend.title = element_blank()
 )
p5



#*****Tutorial
# Analysis Differential Methylated Regions-------------------------------------------------------------------------
#Tiling windows analysis

myobj_lowCov = methRead(file.list,
                       sample.id=list("test1","test2","ctrl1","ctrl2"),
                       assembly="hg18",
                       treatment=c(1,1,0,0),
                       context="CpG",
                       mincov = 3
)

tiles = tileMethylCounts(myobj_lowCov,win.size=1000,step.size=1000,cov.bases = 10)
head(tiles[[1]],5) # test1 中的DMR信息


# find DMP or DMR ---------------------------------------------------------

myDiff=calculateDiffMeth(meth) # mc.cores=2 有多核选项

# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper") 
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)

bedgraph(myDiff25p,file.name="./diff.bedgraph",col.name="meth.diff")


diffMethPerChr(myDiff,plot=FALSE,qvalue.cutoff=0.01, meth.cutoff=25)

diffMethPerChr(myDiff,plot=T,qvalue.cutoff=0.01, meth.cutoff=25)


# Annotation --------------------------------------------------------------
.libPaths(c(.libPaths(), "/home/u24211510018/.conda/envs/genomation/lib/R/library"))
library(genomation)

# read the gene BED file
gene.obj=readTranscriptFeatures(system.file("extdata", "refseq.hg18.bed.txt", 
                                           package = "methylKit"))

annotateWithGeneParts(as(myDiff25p,"GRanges"),gene.obj)

### 造成这样现象的解释是：有外显子必有启动子，有启动子没有外显子
# percentage of target features overlapping with annotation:
#   promoter       exon     intron intergenic 
# 27.82      15.04      34.59      57.14 
# 
# percentage of target features overlapping with annotation:
#   (with promoter > exon > intron precedence):
#   promoter       exon     intron intergenic 
# 27.82       0.00      15.04      57.14 

# 对于基因的每个部分（如启动子、外显子、内含子），这些区域的边界与目标特征重叠的比例
# percentage of annotation boundaries with feature overlap:
#   promoter     exon   intron 
# 0.29     0.03     0.17 


# read the shores and flanking regions and name the flanks as shores 
# and CpG islands as CpGi
cpg.obj=readFeatureFlank(system.file("extdata", "cpgi.hg18.bed.txt", 
                                    package = "methylKit"),
                        feature.flank.name=c("CpGi","shores"))
#
# convert methylDiff object to GRanges and annotate
diffCpGann=annotateWithFeatureFlank(as(myDiff25p,"GRanges"),
                                   cpg.obj$CpGi,cpg.obj$shores,
                                   feature.name="CpGi",flank.name="shores")




# Regional analysis-------------------------------------------------------------------------

## 计算在promoter上，甲基化的水平
promoters=regionCounts(myobj,gene.obj$promoters)

head(promoters[[1]])



# -------------------------------------------------------------------------

diffAnn=annotateWithGeneParts(as(myDiff25p,"GRanges"),gene.obj)

# target.row is the row number in myDiff25p
head(getAssociationWithTSS(diffAnn))
getTargetAnnotationStats(diffAnn,percentage=TRUE,precedence=TRUE)


# -------------------------------------------------------------------------

plotTargetAnnotation(diffAnn,precedence=T,
                    main="differential methylation annotation")

plotTargetAnnotation(diffCpGann,col=c("green","gray","white"),
                    main="differential methylation annotation")
# 返回的是与promoter/exon/intron边界重叠的情况
getFeatsWithTargetsStats(diffAnn,percentage=TRUE)



# selectByOverlap-------------------------------------------------------------------------

library(GenomicRanges)
my.win=GRanges(seqnames="chr21",
              ranges=IRanges(start=seq(from=9764513,by=10000,length.out=20),width=5000) )

# selects the records that lie inside the regions
selectByOverlap(myobj[[1]],my.win)


# Getting percent methylation matrix from methylBase objects --------------

# creates a matrix containing percent methylation values
perc.meth=percMethylation(meth)
head(perc.meth)

#######################################

# 提取甲基化程度矩阵
perc.meth = percMethylation(meth)

# 提取甲基化位点的染色体和位置信息
meth.pos = getData(meth)[, 1:3]  # 提取 chr, start, end
colnames(meth.pos) = c("chr", "start", "end")

# 将甲基化程度和位置信息合并
meth.data = cbind(meth.pos, perc.meth)
head(meth.data)

library(ggplot2)
library(reshape2)

# 将数据转换为长格式（便于ggplot2绘图）
meth.data.long = melt(meth.data, id.vars = c("chr", "start", "end"),
                     variable.name = "sample", value.name = "methylation")

# 绘制单个染色体的甲基化程度图（以 chr1 为例）
chr1.data = subset(meth.data.long, chr == "chr21")

ggplot(chr1.data, aes(x = start, y = methylation, color = sample)) +
 geom_point(alpha = 0.6, size = 0.5) +  # 绘制散点图
 labs(x = "Chromosome Position (bp)", y = "Methylation Level (%)",
      title = "Methylation Level along Chromosome 21") +
 theme_minimal() +
 theme(legend.position = "bottom")


# -------------------------------------------------------------------------

# 提取目标基因的所有外显子
target.exons = gene.obj$exons[gene.obj$exons$name == "NM_199260", ]
target.exons

# 提取目标基因的起始和终止位置
gene.start = min(start(target.exons))  # 所有外显子的最小起始位置
gene.end = max(end(target.exons))      # 所有外显子的最大终止位置

# 打印目标基因的范围
cat("Target gene NM_007186 spans from", gene.start, "to", gene.end, "on chromosome 21.\n")

# 提取目标基因的甲基化数据
target.meth = getData(meth)[getData(meth)$chr == "chr21" & 
                             getData(meth)$start >= gene.start & 
                             getData(meth)$end <= gene.end, ]

# 提取甲基化程度
perc.meth = percMethylation(meth)
perc.meth = perc.meth[c(as.numeric(rownames(target.meth))), ]


# 合并位置和甲基化程度信息
target.data = cbind(target.meth[, 1:3], perc.meth)
head(target.data)

library(reshape2)

# 将数据转换为长格式
target.data.long = melt(target.data, id.vars = c("chr", "start", "end"),
                       variable.name = "sample", value.name = "methylation")
head(target.data.long)

library(ggplot2)

# 绘制目标基因的甲基化程度图
ggplot(target.data.long, aes(x = start, y = methylation, color = sample)) +
 geom_point(alpha = 0.6, size = 2) +  # 绘制散点图
 labs(x = "Gene Position (bp)", y = "Methylation Level (%)",
      title = "Methylation Level across NM_007186 Gene") +
 theme_minimal() +
 theme(legend.position = "bottom")

library(gggenes)
gene.structure = data.frame(
 start = start(target.exons),
 end = end(target.exons),
 type = "exon"
)


# 绘制基因结构图
gene.plot = ggplot(gene.structure, aes(xmin = start, xmax = end, y = 1)) +
 geom_gene_arrow(fill = "lightblue") +
 labs(x = "Gene Position (bp)", y = "", title = "NM_007186 Gene Structure") +
 theme_minimal() +
 theme(axis.text.y = element_blank(),
       axis.ticks.y = element_blank())

print(gene.plot)

library(patchwork)

# 甲基化程度图
meth.plot = ggplot(target.data.long, aes(x = start, y = methylation, color = sample)) +
 geom_point(alpha = 0.6, size = 2) +
 labs(x = "Gene Position (bp)", y = "Methylation Level (%)") +
 theme_minimal() +
 theme(legend.position = "bottom")

# 结合两个图
combined.plot = meth.plot / gene.plot +
 plot_layout(heights = c(3, 1))  # 调整上下图的高度比例

print(combined.plot)