scATAC_Code
对单细胞测序感兴趣的可以先参考下面的课程:
Analysis of single cell RNA-seq data:https://scrnaseq-course.cog.sanger.ac.uk/website/index.html
先验知识包括:
scATAC建库
scATAC数据分析流程(上)
scATAC数据分析流程(下)
下游处理一共有8个steps,每个step我都会在代码块里写上详细的注释
代码和测试数据在:https://github.com/greenleaflab/10x-scatac-2019
每个step的结构是:先把完整的代码运行一遍,然后再分步分析每一步做了什么
Seurat的scATAC-seq分析流程:https://blog.csdn.net/u012110870/article/details/100171472
Step1-Filtering Cells based on TSS enrichment and unique fragments
完整代码
用到的函数
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(magrittr)
library(ggplot2)
library(Rcpp)
library(viridis)
#--------------------------------------------
# Functions
#--------------------------------------------
sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
IntegerMatrix tabulate2dCpp(IntegerVector x1, int xmin, int xmax, IntegerVector y1, int ymin, int ymax){
if(x1.size() != y1.size()){
stop("width must equal size!");
}
IntegerVector x = clone(x1);
IntegerVector y = clone(y1);
int n = x.size();
IntegerVector rx = seq(xmin,xmax);
IntegerVector ry = seq(ymin,ymax);
IntegerMatrix mat( ry.size() , rx.size() );
int xi,yi;
for(int i = 0; i < n; i++){
xi = (x[i] - xmin);
yi = (y[i] - ymin);
if(yi >= 0 && yi < ry.size()){
if(xi >= 0 && xi < rx.size()){
mat( yi , xi ) = mat( yi , xi ) + 1;
}
}
}
return mat;
}'
)
insertionProfileSingles <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
insertionProfileSingles_helper <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
#Convert To Insertion Sites
if(getInsertions){
insertions <- c(
GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
)
by <- "RG"
}else{
insertions <- fragments
}
remove(fragments)
gc()
#center the feature
center <- unique(resize(feature, width = 1, fix = fix, ignore.strand = FALSE))
#get overlaps between the feature and insertions only up to flank bp,这里maxgap设置只要对象间距离小于flank bp即认为有overlap,默认设置为2000bp,也是文中作者采用的参数
overlap <- DataFrame(findOverlaps(query = center, subject = insertions, maxgap = flank, ignore.strand = TRUE))
overlap$strand <- strand(center)[overlap[,1]]
overlap$name <- mcols(insertions)[overlap[,2],by]
overlap <- transform(overlap, id=match(name, unique(name)))
ids <- length(unique(overlap$name))
#add unique identity (ID) (integer) cell barcode to the overlaps object
#distance
overlap$dist <- NA
minus <- which(overlap$strand == "-")
other <- which(overlap$strand != "-")
overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])
overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])
#Insertion Mat 转换成插入片段矩阵
profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -flank, ymax = flank)
colnames(profile_mat) <- unique(overlap$name)
profile <- rowSums(profile_mat)
#normalize
profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells
profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])
#smooth 滑动平均数
profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))
profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1)
#enrichment
max_finite <- function(x){
suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
}
e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))
names(e_mat) <- colnames(profile_mat_norm_smooth)
e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)])
#Summary
df_mat <- data.frame(
enrichment = e_mat,
insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),
insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
)
df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
rownames(df_sum) <- NULL
return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
}
uniqueTags <- as.character(unique(mcols(fragments)[,by]))
splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)
batchTSS <- lapply(seq_along(splitTags), function(x){
setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))
profilex <- insertionProfileSingles_helper(
feature=feature,
fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])],
by = by,
getInsertions = getInsertions,
fix = fix,
flank = flank,
norm = norm,
smooth = smooth,
range = range
)
return(profilex)
})
df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)
dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)
profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
}
输入fragments文件、运行
#--------------------------------------------
# Input
#--------------------------------------------
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
minFrags <- 100
filterFrags <- 1000
filterTSS <- 8
file_fragments <- "data/PBMC_10x-Sub25M-fragments.tsv.gz"
out_fragments <- "data/PBMC_10x-Sub25M-fragments.gr.rds"
name <- "PBMC"
#-----------------
# Reading Fragment Files
#-----------------
message("Reading in fragment files...")
fragments <- data.frame(readr::read_tsv(file_fragments, col_names=FALSE))
#fragmentSub <- fragments[sample(seq_len(nrow(fragments)),25*10^6),]
#write.table(fragmentSub, "data/PBMC_10x-Sub25M-fragments.tsv.gz", col.names=FALSE, row.names =FALSE, sep = "\t", quote = FALSE)
#因为数据很大,读取需要一段时间
Parsed with column specification:
cols(
X1 = col_character(),
X2 = col_double(),
X3 = col_double(),
X4 = col_character(),
X5 = col_double()
)
|==============================================================| 100% 1070 MB
fragments <- GRanges(
seqnames = fragments[,1],
IRanges(fragments[,2]+1, fragments[,3]),
RG = fragments[,4],
N = fragments[,5]
)#先把读入的片段转成GRanges对象
message("Filtering Lowly Represented Cells...")
tabRG <- table(fragments$RG)
keep <- names(tabRG)[which(tabRG >= minFrags)]#要求每个细胞的片段数至少100个,这是降采样的reads,所以不要求很多
fragments <- fragments[fragments$RG %in% keep,]#过滤以后从25M降低至约21M
fragments <- sort(sortSeqlevels(fragments))
#-----------------
# TSS Profile
#-----------------
feature <- txdb %>% transcripts(.) %>% resize(., width = 1, fix = "start") %>% unique#选取的feature为transcripts,并归一化至1bp的起始位点也就是tss
tssProfile <- insertionProfileSingles(feature = feature, fragments = fragments,
getInsertions = TRUE, batchSize = 1000)
tssSingles <- tssProfile$dfall
tssSingles$uniqueFrags <- 0#创建一栏标记每个细胞有多少fragments
tssSingles[names(tabRG),"uniqueFrags"] <- tabRG
#统计每个细胞的uniqueFrags数,后续筛选
head(tssSingles)
enrichment insertions insertionsWindow uniqueFrags
TAGCCGGAGTGAATAC-1 6.196078 3238 2164 1619
TACCTATAGGACTTTC-1 6.470588 2738 1906 1369
TGACAACCATTGTTCT-1 8.274510 3836 3136 1918
ACAAAGAGTAACTCCA-1 6.117647 3840 2411 1920
GAGATTCCACATGATC-1 1.529412 504 408 252
TGATCAGAGGAAGGTA-1 3.058824 2850 1219 1425
tssSingles$cellCall <- 0#We then filtered all scATAC-seq profiles to keep those that had at least 1,000 unique fragments and a TSS enrichment of 8.
tssSingles$cellCall[tssSingles$uniqueFrags >= filterFrags & tssSingles$enrichment >= filterTSS] <- 1
可视化
#-----------------
# Plot Stats
#-----------------
tssSingles <- tssSingles[complete.cases(tssSingles),]#complete.cases返回数据是否缺失的信息
nPass <- sum(tssSingles$cellCall==1)#filter后的细胞数,3770
nTotal <- sum(tssSingles$uniqueFrags >= filterFrags)#仅考虑片段数不考虑tss enrich的过滤后细胞数,7751
pdf("results/Filter-Cells.pdf")
ggplot(tssSingles[tssSingles$uniqueFrags > 500,], aes(x = log10(uniqueFrags), y = enrichment)) +
geom_hex(bins = 100) +
theme_bw() + scale_fill_viridis() +
xlab("log10 Unique Fragments") +
ylab("TSS Enrichment") +
geom_hline(yintercept = filterTSS, lty = "dashed") +
geom_vline(xintercept = log10(filterFrags), lty = "dashed") +
ggtitle(sprintf("Pass Rate : %s of %s (%s)", nPass, nTotal, round(100*nPass/nTotal,2)))
dev.off()
write.table(tssSingles, "results/Filter-Cells.txt")
#Filter,仅保留过滤后细胞的片段
fragments <- fragments[mcols(fragments)$RG %in% rownames(tssSingles)[tssSingles$cellCall==1]]
fragments$RG <- paste0(name,"#",fragments$RG)
#Save
saveRDS(fragments, out_fragments)
Filter_Cells.png
分步解析
insertionProfileSingles做了些什么
乍一看完整的代码会比较懵,我把作者包装的函数insertionProfileSingles拆开来一步步运行,看每一步到底做了些什么
#先看看涉及的一些对象长啥样:
insertions
GRanges object with 43347138 ranges and 1 metadata column:
seqnames ranges strand | RG
<Rle> <IRanges> <Rle> | <character>
[1] chr1 10073 * | TAGCCGGAGTGAATAC-1
[2] chr1 10086 * | TACCTATAGGACTTTC-1
[3] chr1 10227 * | TGACAACCATTGTTCT-1
[4] chr1 10236 * | ACAAAGAGTAACTCCA-1
[5] chr1 10376 * | GAGATTCCACATGATC-1
feature
GRanges object with 51385 ranges and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] chr1 11874 + | 1 uc001aaa.3
[2] chr1 69091 + | 4 uc001aal.1
[3] chr1 321084 + | 5 uc001aaq.2
[4] chr1 321146 + | 6 uc001aar.2
[5] chr1 322037 + | 7 uc009vjk.2
center
GRanges object with 51385 ranges and 2 metadata columns:
seqnames ranges strand | tx_id tx_name
<Rle> <IRanges> <Rle> | <integer> <character>
[1] chr1 11874 + | 1 uc001aaa.3
[2] chr1 69091 + | 4 uc001aal.1
[3] chr1 321084 + | 5 uc001aaq.2
[4] chr1 321146 + | 6 uc001aar.2
[5] chr1 322037 + | 7 uc009vjk.2
overlap
DataFrame with 27778683 rows and 2 columns
queryHits subjectHits
<integer> <integer>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 5
... ... ...
27778679 49027 43345682
27778680 49027 43345683
27778681 49027 43345684
27778682 49027 43345685
27778683 49027 43345686
overlap$strand <- strand(center)[overlap[,1]]#overlap[,1]是第一列query,也就是feature的索引,获取overlap中这些feature的正负链信息
overlap$name <- mcols(insertions)[overlap[,2],]#overlap[,2]是第二列insertions的RG的索引,获取overlap中这些insertions的barcode(name)信息
overlap <- transform(overlap, id=match(name, unique(name)))
#添加id,不同overlap可能对应同一个cell barcode,因此是同一个id
> overlap
DataFrame with 27778683 rows and 5 columns
queryHits subjectHits strand name id
<integer> <integer> <Rle> <character> <integer>
1 1 1 + TAGCCGGAGTGAATAC-1 1
2 1 2 + TACCTATAGGACTTTC-1 2
3 1 3 + TGACAACCATTGTTCT-1 3
4 1 4 + ACAAAGAGTAACTCCA-1 4
5 1 5 + GAGATTCCACATGATC-1 5
... ... ... ... ... ...
27778679 49027 43345682 - CAACGTATCGCCCTTA-1 4603
27778680 49027 43345683 - CAGGGCTAGTACCACT-1 1172
27778681 49027 43345684 - CTCCCAACAGGATAGC-1 982
27778682 49027 43345685 - GTGGCGTTCAGTACAC-1 2850
27778683 49027 43345686 - CTCCCAACAGGATAGC-1 982
ids <- length(unique(overlap$name))
ids#总共有多少cell
[1] 21673
#distance 获取overlap计数
overlap$dist <- NA
minus <- which(overlap$strand == "-")
other <- which(overlap$strand != "-")
overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])#如果feature在负链上,就用feature的起点减去insertions的起点作为距离
overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])#反之,距离就是insertions减去feature,也就是获得的片段距离tss的长度
#Insertion Mat 转换成插入片段矩阵,每一列为barcode ids,每一行为每个overlap里的的片段数量,对列求和应该就是每个细胞的总的与feature有重叠的counts数
profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -2000, ymax = 2000)
colnames(profile_mat) <- unique(overlap$name)
profile <- rowSums(profile_mat)
> str(profile_mat)
int [1:4001, 1:21673] 0 0 1 0 0 0 0 0 0 0 ...
> profile_mat[1:4,1:4]
[,1] [,2] [,3] [,4]
[1,] 0 0 0 0
[2,] 0 0 1 0
[3,] 1 0 2 0
[4,] 0 0 0 0
#矩阵的每一列对应一个细胞,每一行对应的应该是所有tss附近的overlap计数,一共在tss附近统计了±2,000 bp
#,越远离tss应该0值越多,这是我自己推测的,感兴趣的可以去看看tabulate2dCpp这个函数的c代码
head(profile)#profile就是每一个tss附近在所有细胞中有多少overlap
[1] 2190 2060 2256 2247 2275 2127
#normalize
profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells,意思是对每一个细胞取前100和后100个tss的counts,如果平均counts数低于0.5就把每一列除以0.5,如果平均counts数大于0.5就每一列除以平均counts数,减少每个细胞文库大小不均的影响
profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])#把tss的富集丰度变成小数
head(profile_norm)
[1] 1.137927 1.070379 1.172221 1.167544 1.182093 1.105192
#smooth 求滑动平均数,可以使序列变平滑,将N个连续的时间序列成员作为一个集合,计算该集合的平均值,并逐项推移该集合
profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))#以smooth=51为滑动窗口长度,fill的意思是前25个滑动平均数都用1填充
profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1) #因此,profile_mat_norm_smooth反映了每个细胞的tss富集情况,profile_norm_smooth反映了所有细胞的tss的富集情况
#enrichment
max_finite <- function(x){
suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
}
e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))#取1900-2100范围的tss富集情况
names(e_mat) <- colnames(profile_mat_norm_smooth)
e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)]) #Summary
df_mat <- data.frame(
enrichment = e_mat,
insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),#这里的insertions是每个细胞fragments数目
insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
)#这个是每个细胞overlap的数目
df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
rownames(df_sum) <- NULL
return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
}
head(df_sum)
bp profile norm_profile smooth_norm_profile enrichment
1 -2000 2190 1.137927 1 17.91299
2 -1999 2060 1.070379 1 17.91299
3 -1998 2256 1.172221 1 17.91299
4 -1997 2247 1.167544 1 17.91299
5 -1996 2275 1.182093 1 17.91299
6 -1995 2127 1.105192 1 17.91299
#可以看到在远离tss的地方,profile值(所有细胞在该位点总的counts)只有2000左右,当然smooth也低
df_sum[c(seq(1995,2005)),]#但是tss附近profile值很高,smooth也是。enrichment是smooth_norm_profile的最大值,所有细胞总的tss的最大富集分数
bp profile norm_profile smooth_norm_profile enrichment
1995 -6 32300 16.78312 16.90412 17.91299
1996 -5 31558 16.39758 16.88616 17.91299
1997 -4 29566 15.36253 16.85958 17.91299
1998 -3 30395 15.79328 16.82941 17.91299
1999 -2 32723 17.00292 16.79287 17.91299
2000 -1 35846 18.62563 16.74796 17.91299
2001 0 36234 18.82724 16.69909 17.91299
2002 1 33384 17.34637 16.65906 17.91299
2003 2 31181 16.20169 16.62594 17.91299
2004 3 30488 15.84161 16.59353 17.91299
2005 4 32274 16.76961 16.56338 17.91299
批量运行
测试数据只有2万多个细胞,所以我像上面那样直接运行问题可能不大,实际数据量可能很大(几十万个细胞),所以考虑用batch进行分批处理的方法,每次处理100个细胞,最后把结果合并。注意作者包装的函数insertionProfileSingles里面还嵌套了一个函数insertionProfileSingles_helper,所以正式运行的话,外函数里运行的是下面的部分
uniqueTags <- as.character(unique(mcols(fragments)[,by]))
splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
#ceiling向上取整,把所有unique barcode的序号除以100取整,每100个barcode作为一个group,一共217个group
pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)#展示进度条
batchTSS <- lapply(seq_along(splitTags), function(x){
setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))#相当于每完成一个循环就进度条加1
profilex <- insertionProfileSingles_helper(#之前的操作都是定义在insertionProfileSingles_helper这个函数里面,这里才正式传参运行,每次只提取100个细胞的fragments
feature=feature,
fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])],
by = by,
getInsertions = getInsertions,
fix = fix,
flank = flank,
norm = norm,
smooth = smooth,
range = range
)
return(profilex)
})
df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)#把每个batch运行的结果合并
dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)#dfall就是df_mat
profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
}