说明,这是搬运的官网教程,我写了注释/我的理解等等,可以说是自己的笔记。
喜欢的留留言,动动小手点点赞呀!😘
参考:
R script
rm(list = ls())
setwd("~/workspace/MeDIP_Project/")
data(samplesNSCLC, package="MEDIPSData")
samples_NSCLC
path=system.file("extdata", package="MEDIPSData")
samples_NSCLC$file_name=paste0(path,"/",samples_NSCLC$file_name)
head(samples_NSCLC)
library(qsea)
library(BSgenome.Hsapiens.UCSC.hg19)
# I need prepare a table for data.
# BSgenome: The reference genome name as defined by BSgenome. (required)
# window_size: defines the genomic resolution by which short read coverage is calculated. (default:250 bases)
# chr.select: Only data at the specified chromosomes will be processed. (default: NULL=all chromosomes)
# Regions: If specified, only data in the specified regions will be processed. GRanges object. If specified, only selected regions are processed
qseaSet=createQseaSet(sampleTable=samples_NSCLC,
BSgenome="BSgenome.Hsapiens.UCSC.hg19",
chr.select=paste0("chr", 20:22),
window_size=500)
qseaSet
# 计算在每一个窗口的覆盖度,uniquePos = TRUE,表示是否将起止位置完全相同的读段视为 PCR 重复 并合并为一个,去重!!!
qseaSet=addCoverage(qseaSet, uniquePos=TRUE, paired=TRUE)
# 由于 CNV 扩增可能会导致,甲基化片段增多,所以需要对 CNV 进行矫正,但是在cfDNA我认为不是很需要
# 因为我们只需要关注变多的 read,而不关心怎么变多的。这行代码演示的是qsea自带的 internal CNV 推断,最好是用input_file来进行 CNV 的推断。
# Note: 我认为我是不需要的
# 内部估计 CNV 逻辑
# → 读取 MeDIP bam
# → 过滤掉所有含 CpG 的 reads(判断序列)
# → 保留“不含 CpG”的 reads
# → 按 2 Mb 窗口统计这些 reads 的覆盖度
# → 输入 HMM 模型划分 CNV 状态
qseaSet=addCNV(qseaSet, file_name="file_name",window_size=2e6,
paired=TRUE, parallel=FALSE, MeDIP=TRUE)
pdf("./CNV_QSEA.pdf")
plotCNV(qseaSet)
dev.off()
# -------------------------------------------------------------------------
# estimate library size
qseaSet=addLibraryFactors(qseaSet)
# we estimate the average CpG density per fragment for each genomic window.
qseaSet=addPatternDensity(qseaSet, "CG", name="CpG")
# From the regions without CpGs we can estimate the coverage offset from background reads.
# estimate non-specific bind in background,为了后续
qseaSet = addOffset(qseaSet, enrichmentPattern = "CpG")
# -------------------------------------------------------------------------
# 建模 MeDIP-seq 富集效率与 CpG 密度的关系,从而校正测序覆盖度的偏差,
# 最终将 MeDIP-seq 数据转换为 绝对甲基化水平(类似 BS-seq 的 β 值)
# 我需要找一个 TCGA 的 pancreatic 的数据 450k 数据
data(tcga_luad_lusc_450kmeth, package="MEDIPSData")
wd=findOverlaps(tcga_luad_lusc_450kmeth, getRegions(qseaSet), select="first")
# wd 为,tcga 数据库的第一个区间,匹配到了qseaSet的第 n 个区间
signal=as.matrix(mcols(tcga_luad_lusc_450kmeth)[,rep(1:2,3)])
# signal 是窗口的甲基化水平, wd为重叠的区间索引, rep(1:2,3)变成 T/N/T/N/T/N/
qseaSet=addEnrichmentParameters(qseaSet, enrichmentPattern="CpG",
windowIdx=wd, signal=signal)
# -------------------------------------------------------------------------
# check the estimated fraction of background reads
getOffset(qseaSet, scale="fraction")
# -------------------------------------------------------------------------
# plot enrichment profiles, x axis is
pdf("./enrichment_profiles.pdf")
plotEPmatrix(qseaSet)
dev.off()
# -------------------------------------------------------------------------
# 获取注释数据
data(annotation, package="MEDIPSData")
# 定义 CGIsland_promoter
CGIprom=intersect(ROIs[["CpG Island"]], ROIs[["TSS"]],ignore.strand=TRUE)
# 把CGIprom作为 Region of Interested (ROIs),来绘制 PCA
pca_cgi=getPCA(qseaSet, norm_method="beta", ROIs=CGIprom)
col=rep(c("red", "green"), 3)
pdf("./PCA_QSEA.pdf")
plotPCA(pca_cgi, bg=col, main="PCA plot based on CpG Island Promoters")
dev.off()
# -------------------------------------------------------------------------
# contrast matrix
design=model.matrix(~group, getSampleTable(qseaSet) )
qseaGLM=fitNBglm(qseaSet, design, norm_method="beta")
# coef=2 对比 design 矩阵的第 2 列(即 groupTumor)
qseaGLM=addContrast(qseaSet,qseaGLM, coef=2, name="TvN" )
# -------------------------------------------------------------------------
library(GenomicRanges)
sig=isSignificant(qseaGLM, fdr_th=0.01)
## selecting regions from contrast TvN
## selected 1006/169189 windows
result=makeTable(qseaSet,
glm=qseaGLM,
groupMeans=getSampleGroups(qseaSet),
keep=sig,
annotation=ROIs,
norm_method="beta")
## adding test results from TvN
## obtaining raw values for 6 samples in 1006 windows
## deriving beta values...
## adding annotation
## creating table...
head(result)
# -------------------------------------------------------------------------
# isSignificant中的参数function (glm, contrast = NULL, fdr_th = NULL, pval_th = NULL, absLogFC_th = NULL, direction = "both")
# 设置阈值,导出 significant result of gain(hyper-methylated) and loss(hypo-methylated)
sigList=list(gain=isSignificant(qseaGLM, fdr_th=0.1,direction="gain"),
loss=isSignificant(qseaGLM, fdr_th=0.1,direction="loss"))
## selecting regions from contrast TvN
## selected 725/169189 windows
## selecting regions from contrast TvN
## selected 2842/169189 windows
roi_stats=regionStats(qseaSet, subsets=sigList, ROIs=ROIs)
## getting numbers of overlaps with total qs
## getting numbers of overlaps with gain
## getting numbers of overlaps with loss
# > roi_stats
# total gain loss
# all Regions 324919 624 2120
# 符合肿瘤全局低甲基化的特征
# -------------------------------------------------------------------------
# 计算 gain/loss 占total的比例
roi_stats_rel=roi_stats[,-1]/roi_stats[,1]
x=barplot(t(roi_stats_rel)*100,ylab="fraction of ROIs[%]",
names.arg=rep("", length(ROIs)+1), beside=TRUE, legend=TRUE,
las=2, args.legend=list(x="topleft"),
main="Feature enrichment Tumor vs Normal DNA methylation")
# 七列代表了七个 features,两行分别代表了 gain/loss bar图的中心位置
# > x
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
# [1,] 1.5 4.5 7.5 10.5 13.5 16.5 19.5
# [2,] 2.5 5.5 8.5 11.5 14.5 17.5 20.5
text(x=x[2,],y=-.15,labels=rownames(roi_stats_rel), xpd=TRUE, srt=30, cex=1, adj=c(1,0))
# -------------------------------------------------------------------------
# 如果你有感兴趣的 region,y 轴为甲基化 β 值(0~1,1=完全甲基化)
# TFBS 为 transcriptional factor binding site
plotCoverage(qseaSet, samples=getSampleNames(qseaSet),
chr="chr20", start=38076001, end=38090000, norm_method="beta",
col=rep(c("#1F77B4", "#FF7F0E"), 3), yoffset=1,space=.1, reorder="clust",
regions=ROIs["TFBS"],regions_offset=.5, cex=.7 )
# -------------------------------------------------------------------------
# 还可以通过多线程加速。
library("BiocParallel")
register(MulticoreParam(workers=3))
qseaSet=addCoverage(qseaSet, uniquePos=TRUE, paired=TRUE, parallel=TRUE)
Plot
Feature enrichment
beta plot in interested region
PCA plot
CpG density and rpkm
CNV estimated from QSEA internal function