《全网最全QSEA学习教程!!!》

说明,这是搬运的官网教程,我写了注释/我的理解等等,可以说是自己的笔记。

喜欢的留留言,动动小手点点赞呀!😘

参考:

https://bioconductor.org/packages/release/bioc/vignettes/qsea/inst/doc/qsea_tutorial.html#qsea-workflow

R script

rm(list = ls())
setwd("~/workspace/MeDIP_Project/")


data(samplesNSCLC, package="MEDIPSData")
samples_NSCLC
path=system.file("extdata", package="MEDIPSData")
samples_NSCLC$file_name=paste0(path,"/",samples_NSCLC$file_name)
head(samples_NSCLC)
library(qsea)
library(BSgenome.Hsapiens.UCSC.hg19)
# I need prepare a table for data.

# BSgenome: The reference genome name as defined by BSgenome. (required)
# window_size: defines the genomic resolution by which short read coverage is calculated. (default:250 bases)
# chr.select: Only data at the specified chromosomes will be processed. (default: NULL=all chromosomes)
# Regions: If specified, only data in the specified regions will be processed. GRanges object. If specified, only selected regions are processed

qseaSet=createQseaSet(sampleTable=samples_NSCLC, 
                      BSgenome="BSgenome.Hsapiens.UCSC.hg19", 
                      chr.select=paste0("chr", 20:22), 
                      window_size=500)

qseaSet
# 计算在每一个窗口的覆盖度,uniquePos = TRUE,表示是否将起止位置完全相同的读段视为 PCR 重复 并合并为一个,去重!!!
qseaSet=addCoverage(qseaSet, uniquePos=TRUE, paired=TRUE)
# 由于 CNV 扩增可能会导致,甲基化片段增多,所以需要对 CNV 进行矫正,但是在cfDNA我认为不是很需要
# 因为我们只需要关注变多的 read,而不关心怎么变多的。这行代码演示的是qsea自带的 internal CNV 推断,最好是用input_file来进行 CNV 的推断。
# Note: 我认为我是不需要的
# 内部估计 CNV 逻辑
# → 读取 MeDIP bam
# → 过滤掉所有含 CpG 的 reads(判断序列)
# → 保留“不含 CpG”的 reads
# → 按 2 Mb 窗口统计这些 reads 的覆盖度
# → 输入 HMM 模型划分 CNV 状态

qseaSet=addCNV(qseaSet, file_name="file_name",window_size=2e6, 
               paired=TRUE, parallel=FALSE, MeDIP=TRUE)
pdf("./CNV_QSEA.pdf")
plotCNV(qseaSet)
dev.off()


# -------------------------------------------------------------------------


# estimate library size
qseaSet=addLibraryFactors(qseaSet)

# we estimate the average CpG density per fragment for each genomic window.
qseaSet=addPatternDensity(qseaSet, "CG", name="CpG")
# From the regions without CpGs we can estimate the coverage offset from background reads.
# estimate non-specific bind in background,为了后续
qseaSet = addOffset(qseaSet, enrichmentPattern = "CpG")

# -------------------------------------------------------------------------

# 建模 MeDIP-seq 富集效率与 CpG 密度的关系,从而校正测序覆盖度的偏差,
# 最终将 MeDIP-seq 数据转换为 绝对甲基化水平(类似 BS-seq 的 β 值)
# 我需要找一个 TCGA 的 pancreatic 的数据 450k 数据

data(tcga_luad_lusc_450kmeth, package="MEDIPSData")

wd=findOverlaps(tcga_luad_lusc_450kmeth, getRegions(qseaSet), select="first")
# wd 为,tcga 数据库的第一个区间,匹配到了qseaSet的第 n 个区间 
signal=as.matrix(mcols(tcga_luad_lusc_450kmeth)[,rep(1:2,3)])
# signal 是窗口的甲基化水平, wd为重叠的区间索引, rep(1:2,3)变成 T/N/T/N/T/N/
qseaSet=addEnrichmentParameters(qseaSet, enrichmentPattern="CpG", 
                                windowIdx=wd, signal=signal)

# -------------------------------------------------------------------------

# check the estimated fraction of background reads
getOffset(qseaSet, scale="fraction")

# -------------------------------------------------------------------------

# plot enrichment profiles, x axis is 
pdf("./enrichment_profiles.pdf")
plotEPmatrix(qseaSet)
dev.off()
# -------------------------------------------------------------------------
# 获取注释数据
data(annotation, package="MEDIPSData")
# 定义 CGIsland_promoter
CGIprom=intersect(ROIs[["CpG Island"]], ROIs[["TSS"]],ignore.strand=TRUE)
# 把CGIprom作为 Region of Interested (ROIs),来绘制 PCA
pca_cgi=getPCA(qseaSet, norm_method="beta", ROIs=CGIprom)
col=rep(c("red", "green"), 3)
pdf("./PCA_QSEA.pdf")
plotPCA(pca_cgi, bg=col, main="PCA plot based on CpG Island Promoters")
dev.off()

# -------------------------------------------------------------------------
# contrast matrix
design=model.matrix(~group, getSampleTable(qseaSet) )
qseaGLM=fitNBglm(qseaSet, design, norm_method="beta")
# coef=2 对比 design 矩阵的第 2 列(即 groupTumor)
qseaGLM=addContrast(qseaSet,qseaGLM, coef=2, name="TvN" )

# -------------------------------------------------------------------------

library(GenomicRanges)
sig=isSignificant(qseaGLM, fdr_th=0.01)
## selecting regions from contrast TvN
## selected 1006/169189 windows

result=makeTable(qseaSet, 
                 glm=qseaGLM, 
                 groupMeans=getSampleGroups(qseaSet), 
                 keep=sig, 
                 annotation=ROIs, 
                 norm_method="beta")
## adding test results from TvN
## obtaining raw values for 6 samples in 1006 windows
## deriving beta values...
## adding annotation
## creating table...
head(result)

# -------------------------------------------------------------------------

# isSignificant中的参数function (glm, contrast = NULL, fdr_th = NULL, pval_th = NULL, absLogFC_th = NULL, direction = "both") 
# 设置阈值,导出 significant result of gain(hyper-methylated) and loss(hypo-methylated)
sigList=list(gain=isSignificant(qseaGLM, fdr_th=0.1,direction="gain"),
             loss=isSignificant(qseaGLM, fdr_th=0.1,direction="loss"))
## selecting regions from contrast TvN
## selected 725/169189 windows
## selecting regions from contrast TvN
## selected 2842/169189 windows
roi_stats=regionStats(qseaSet, subsets=sigList, ROIs=ROIs)
## getting numbers of overlaps with total qs
## getting numbers of overlaps with gain
## getting numbers of overlaps with loss
# > roi_stats
# total gain loss
# all Regions 324919  624 2120
# 符合肿瘤全局低甲基化的特征

# -------------------------------------------------------------------------
# 计算 gain/loss 占total的比例
roi_stats_rel=roi_stats[,-1]/roi_stats[,1]
x=barplot(t(roi_stats_rel)*100,ylab="fraction of ROIs[%]",
          names.arg=rep("", length(ROIs)+1),  beside=TRUE, legend=TRUE, 
          las=2, args.legend=list(x="topleft"), 
          main="Feature enrichment Tumor vs Normal DNA methylation")
# 七列代表了七个 features,两行分别代表了 gain/loss bar图的中心位置
# > x
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
# [1,]  1.5  4.5  7.5 10.5 13.5 16.5 19.5
# [2,]  2.5  5.5  8.5 11.5 14.5 17.5 20.5
text(x=x[2,],y=-.15,labels=rownames(roi_stats_rel), xpd=TRUE, srt=30, cex=1, adj=c(1,0))

# -------------------------------------------------------------------------
# 如果你有感兴趣的 region,y 轴为甲基化 β 值(0~1,1=完全甲基化)
# TFBS 为 transcriptional factor binding site
plotCoverage(qseaSet, samples=getSampleNames(qseaSet), 
             chr="chr20", start=38076001, end=38090000, norm_method="beta", 
             col=rep(c("#1F77B4", "#FF7F0E"), 3), yoffset=1,space=.1, reorder="clust", 
             regions=ROIs["TFBS"],regions_offset=.5, cex=.7 ) 

# -------------------------------------------------------------------------
# 还可以通过多线程加速。

library("BiocParallel")
register(MulticoreParam(workers=3))
qseaSet=addCoverage(qseaSet, uniquePos=TRUE, paired=TRUE, parallel=TRUE)

Plot

Feature enrichment
beta plot in interested region
PCA plot
CpG density and rpkm
CNV estimated from QSEA internal function
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容