scATAC_Code

对单细胞测序感兴趣的可以先参考下面的课程：
Analysis of single cell RNA-seq data：https://scrnaseq-course.cog.sanger.ac.uk/website/index.html
先验知识包括：
scATAC建库
 scATAC数据分析流程（上）
scATAC数据分析流程（下）
下游处理一共有8个steps，每个step我都会在代码块里写上详细的注释
代码和测试数据在：https://github.com/greenleaflab/10x-scatac-2019

每个step的结构是：先把完整的代码运行一遍，然后再分步分析每一步做了什么

Seurat的scATAC-seq分析流程：https://blog.csdn.net/u012110870/article/details/100171472

Step1-Filtering Cells based on TSS enrichment and unique fragments

完整代码

用到的函数

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(magrittr)
library(ggplot2)
library(Rcpp)
library(viridis)
#--------------------------------------------
# Functions
#--------------------------------------------

sourceCpp(code='
  #include <Rcpp.h>

  using namespace Rcpp;
  using namespace std;

  // [[Rcpp::export]]
  IntegerMatrix tabulate2dCpp(IntegerVector x1, int xmin, int xmax, IntegerVector y1, int ymin, int ymax){
    if(x1.size() != y1.size()){
      stop("width must equal size!");
    }
    IntegerVector x = clone(x1);
    IntegerVector y = clone(y1);
    int n = x.size();
    IntegerVector rx = seq(xmin,xmax);
    IntegerVector ry = seq(ymin,ymax);
    IntegerMatrix mat( ry.size() , rx.size() );
    int xi,yi;
    for(int i = 0; i < n; i++){
      xi = (x[i] - xmin);
      yi = (y[i] - ymin);
      if(yi >= 0 && yi < ry.size()){
        if(xi >= 0 && xi < rx.size()){
          mat( yi , xi ) = mat( yi , xi ) + 1; 
        }
      }
    }
    return mat;
  }'
)

insertionProfileSingles <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
  
  insertionProfileSingles_helper <- function(feature, fragments, by = "RG", getInsertions = TRUE, fix = "center", flank = 2000, norm = 100, smooth = 51, range = 100, batchSize = 100){
    #Convert To Insertion Sites
    if(getInsertions){
        insertions <- c(
          GRanges(seqnames = seqnames(fragments), ranges = IRanges(start(fragments), start(fragments)), RG = mcols(fragments)[,by]),
          GRanges(seqnames = seqnames(fragments), ranges = IRanges(end(fragments), end(fragments)), RG = mcols(fragments)[,by])
        )
        by <- "RG"
    }else{
      insertions <- fragments
    }
    remove(fragments)
    gc()

    #center the feature
    center <- unique(resize(feature, width = 1, fix = fix, ignore.strand = FALSE))
    
    #get overlaps between the feature and insertions only up to flank bp,这里maxgap设置只要对象间距离小于flank bp即认为有overlap，默认设置为2000bp，也是文中作者采用的参数
    overlap <- DataFrame(findOverlaps(query = center, subject = insertions, maxgap = flank, ignore.strand = TRUE))
    overlap$strand <- strand(center)[overlap[,1]]
    overlap$name <- mcols(insertions)[overlap[,2],by]
    overlap <- transform(overlap, id=match(name, unique(name)))
    ids <- length(unique(overlap$name))
    #add unique identity (ID) (integer) cell barcode to the overlaps object

    #distance   
    overlap$dist <- NA
    minus <- which(overlap$strand == "-")
    other <- which(overlap$strand != "-")
    overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])
    overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])

    #Insertion Mat  转换成插入片段矩阵
    profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -flank, ymax = flank)
    colnames(profile_mat) <- unique(overlap$name)
    profile <- rowSums(profile_mat)

    #normalize  
    profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells
    profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])

    #smooth   滑动平均数
    profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))
    profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1)

    #enrichment
    max_finite <- function(x){
      suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
    }
    e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))
    names(e_mat) <- colnames(profile_mat_norm_smooth)
    e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)])

    #Summary
    df_mat <- data.frame(
      enrichment = e_mat,
      insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),
      insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
      )
    df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
    rownames(df_sum) <-  NULL

    return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
  }
  
  uniqueTags <- as.character(unique(mcols(fragments)[,by]))
  splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
  
  pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)
  batchTSS <- lapply(seq_along(splitTags), function(x){
    setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))
    profilex <- insertionProfileSingles_helper(
        feature=feature, 
        fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])], 
        by = by, 
        getInsertions = getInsertions,
        fix = fix, 
        flank = flank, 
        norm = norm, 
        smooth = smooth, 
        range = range
      )

    return(profilex)
  })
  df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)
  dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)
  profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
  profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
  return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
}

输入fragments文件、运行

#--------------------------------------------
# Input
#--------------------------------------------
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
minFrags <- 100
filterFrags <- 1000
filterTSS <- 8
file_fragments <- "data/PBMC_10x-Sub25M-fragments.tsv.gz"
out_fragments <- "data/PBMC_10x-Sub25M-fragments.gr.rds"
name <- "PBMC"

#-----------------
# Reading Fragment Files
#-----------------
message("Reading in fragment files...")
fragments <- data.frame(readr::read_tsv(file_fragments, col_names=FALSE))
#fragmentSub <- fragments[sample(seq_len(nrow(fragments)),25*10^6),]
#write.table(fragmentSub, "data/PBMC_10x-Sub25M-fragments.tsv.gz", col.names=FALSE, row.names =FALSE, sep = "\t", quote = FALSE)

#因为数据很大，读取需要一段时间
Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_double(),
  X3 = col_double(),
  X4 = col_character(),
  X5 = col_double()
)
|==============================================================| 100% 1070 MB
fragments <- GRanges(
  seqnames = fragments[,1], 
  IRanges(fragments[,2]+1, fragments[,3]), 
  RG = fragments[,4], 
  N = fragments[,5]
  )#先把读入的片段转成GRanges对象

message("Filtering Lowly Represented Cells...")
tabRG <- table(fragments$RG)
keep <- names(tabRG)[which(tabRG >= minFrags)]#要求每个细胞的片段数至少100个，这是降采样的reads，所以不要求很多
fragments <- fragments[fragments$RG %in% keep,]#过滤以后从25M降低至约21M
fragments <- sort(sortSeqlevels(fragments))

#-----------------
# TSS Profile
#-----------------
feature <- txdb %>% transcripts(.) %>% resize(., width = 1, fix = "start") %>% unique#选取的feature为transcripts,并归一化至1bp的起始位点也就是tss
tssProfile <- insertionProfileSingles(feature = feature, fragments = fragments, 
  getInsertions = TRUE, batchSize = 1000)
 
tssSingles <- tssProfile$dfall
tssSingles$uniqueFrags <- 0#创建一栏标记每个细胞有多少fragments
tssSingles[names(tabRG),"uniqueFrags"] <- tabRG
#统计每个细胞的uniqueFrags数，后续筛选
head(tssSingles)
                   enrichment insertions insertionsWindow uniqueFrags
TAGCCGGAGTGAATAC-1   6.196078       3238             2164        1619
TACCTATAGGACTTTC-1   6.470588       2738             1906        1369
TGACAACCATTGTTCT-1   8.274510       3836             3136        1918
ACAAAGAGTAACTCCA-1   6.117647       3840             2411        1920
GAGATTCCACATGATC-1   1.529412        504              408         252
TGATCAGAGGAAGGTA-1   3.058824       2850             1219        1425
tssSingles$cellCall <- 0#We then filtered all scATAC-seq profiles to keep those that had at least 1,000 unique fragments and a TSS enrichment of 8.

tssSingles$cellCall[tssSingles$uniqueFrags >= filterFrags & tssSingles$enrichment >= filterTSS] <- 1

可视化

#-----------------
# Plot Stats
#-----------------
tssSingles <- tssSingles[complete.cases(tssSingles),]#complete.cases返回数据是否缺失的信息
nPass  <- sum(tssSingles$cellCall==1)#filter后的细胞数，3770
nTotal <- sum(tssSingles$uniqueFrags >= filterFrags)#仅考虑片段数不考虑tss enrich的过滤后细胞数，7751

pdf("results/Filter-Cells.pdf")
ggplot(tssSingles[tssSingles$uniqueFrags > 500,], aes(x = log10(uniqueFrags), y = enrichment)) +
  geom_hex(bins = 100) +
  theme_bw() + scale_fill_viridis() +
  xlab("log10 Unique Fragments") +
  ylab("TSS Enrichment") +
  geom_hline(yintercept = filterTSS, lty = "dashed") +
  geom_vline(xintercept = log10(filterFrags), lty = "dashed") +
  ggtitle(sprintf("Pass Rate : %s of %s (%s)", nPass, nTotal, round(100*nPass/nTotal,2)))
dev.off()

write.table(tssSingles, "results/Filter-Cells.txt")

#Filter，仅保留过滤后细胞的片段
fragments <- fragments[mcols(fragments)$RG %in% rownames(tssSingles)[tssSingles$cellCall==1]]
fragments$RG <- paste0(name,"#",fragments$RG)

#Save
saveRDS(fragments, out_fragments)

Filter_Cells.png

分步解析

insertionProfileSingles做了些什么

乍一看完整的代码会比较懵，我把作者包装的函数insertionProfileSingles拆开来一步步运行，看每一步到底做了些什么

#先看看涉及的一些对象长啥样：
insertions
GRanges object with 43347138 ranges and 1 metadata column:
             seqnames    ranges strand |                 RG
                <Rle> <IRanges>  <Rle> |        <character>
         [1]     chr1     10073      * | TAGCCGGAGTGAATAC-1
         [2]     chr1     10086      * | TACCTATAGGACTTTC-1
         [3]     chr1     10227      * | TGACAACCATTGTTCT-1
         [4]     chr1     10236      * | ACAAAGAGTAACTCCA-1
         [5]     chr1     10376      * | GAGATTCCACATGATC-1
feature
GRanges object with 51385 ranges and 2 metadata columns:
                seqnames    ranges strand |     tx_id     tx_name
                   <Rle> <IRanges>  <Rle> | <integer> <character>
      [1]           chr1     11874      + |         1  uc001aaa.3
      [2]           chr1     69091      + |         4  uc001aal.1
      [3]           chr1    321084      + |         5  uc001aaq.2
      [4]           chr1    321146      + |         6  uc001aar.2
      [5]           chr1    322037      + |         7  uc009vjk.2
center
GRanges object with 51385 ranges and 2 metadata columns:
                seqnames    ranges strand |     tx_id     tx_name
                   <Rle> <IRanges>  <Rle> | <integer> <character>
      [1]           chr1     11874      + |         1  uc001aaa.3
      [2]           chr1     69091      + |         4  uc001aal.1
      [3]           chr1    321084      + |         5  uc001aaq.2
      [4]           chr1    321146      + |         6  uc001aar.2
      [5]           chr1    322037      + |         7  uc009vjk.2
overlap
DataFrame with 27778683 rows and 2 columns
         queryHits subjectHits
         <integer>   <integer>
1                1           1
2                1           2
3                1           3
4                1           4
5                1           5
...            ...         ...
27778679     49027    43345682
27778680     49027    43345683
27778681     49027    43345684
27778682     49027    43345685
27778683     49027    43345686
overlap$strand <- strand(center)[overlap[,1]]#overlap[,1]是第一列query，也就是feature的索引，获取overlap中这些feature的正负链信息
overlap$name <- mcols(insertions)[overlap[,2],]#overlap[,2]是第二列insertions的RG的索引，获取overlap中这些insertions的barcode(name)信息
overlap <- transform(overlap, id=match(name, unique(name)))
#添加id,不同overlap可能对应同一个cell barcode，因此是同一个id
> overlap
DataFrame with 27778683 rows and 5 columns
         queryHits subjectHits strand               name        id
         <integer>   <integer>  <Rle>        <character> <integer>
1                1           1      + TAGCCGGAGTGAATAC-1         1
2                1           2      + TACCTATAGGACTTTC-1         2
3                1           3      + TGACAACCATTGTTCT-1         3
4                1           4      + ACAAAGAGTAACTCCA-1         4
5                1           5      + GAGATTCCACATGATC-1         5
...            ...         ...    ...                ...       ...
27778679     49027    43345682      - CAACGTATCGCCCTTA-1      4603
27778680     49027    43345683      - CAGGGCTAGTACCACT-1      1172
27778681     49027    43345684      - CTCCCAACAGGATAGC-1       982
27778682     49027    43345685      - GTGGCGTTCAGTACAC-1      2850
27778683     49027    43345686      - CTCCCAACAGGATAGC-1       982
ids <- length(unique(overlap$name))
ids#总共有多少cell
[1] 21673
#distance   获取overlap计数
    overlap$dist <- NA
    minus <- which(overlap$strand == "-")
    other <- which(overlap$strand != "-")
    overlap$dist[minus] <- start(center[overlap[minus,1]]) - start(insertions[overlap[minus,2]])#如果feature在负链上，就用feature的起点减去insertions的起点作为距离
    overlap$dist[other] <- start(insertions[overlap[other,2]]) - start(center[overlap[other,1]])#反之，距离就是insertions减去feature,也就是获得的片段距离tss的长度

#Insertion Mat  转换成插入片段矩阵，每一列为barcode ids，每一行为每个overlap里的的片段数量，对列求和应该就是每个细胞的总的与feature有重叠的counts数
profile_mat <- tabulate2dCpp(x1 = overlap$id, y1 = overlap$dist, xmin = 1, xmax = ids, ymin = -2000, ymax = 2000)
colnames(profile_mat) <- unique(overlap$name)
profile <- rowSums(profile_mat)
> str(profile_mat)
 int [1:4001, 1:21673] 0 0 1 0 0 0 0 0 0 0 ...
> profile_mat[1:4,1:4]
     [,1] [,2] [,3] [,4]
[1,]    0    0    0    0
[2,]    0    0    1    0
[3,]    1    0    2    0
[4,]    0    0    0    0
#矩阵的每一列对应一个细胞，每一行对应的应该是所有tss附近的overlap计数，一共在tss附近统计了±2,000 bp
#，越远离tss应该0值越多，这是我自己推测的，感兴趣的可以去看看tabulate2dCpp这个函数的c代码
head(profile)#profile就是每一个tss附近在所有细胞中有多少overlap
[1] 2190 2060 2256 2247 2275 2127
#normalize  
profile_mat_norm <- apply(profile_mat, 2, function(x) x/max(mean(x[c(1:norm,(flank*2-norm+1):(flank*2+1))]), 0.5)) #Handles low depth cells，意思是对每一个细胞取前100和后100个tss的counts，如果平均counts数低于0.5就把每一列除以0.5，如果平均counts数大于0.5就每一列除以平均counts数，减少每个细胞文库大小不均的影响
profile_norm <- profile/mean(profile[c(1:norm,(flank*2-norm+1):(flank*2+1))])#把tss的富集丰度变成小数
head(profile_norm)
[1] 1.137927 1.070379 1.172221 1.167544 1.182093 1.105192
#smooth   求滑动平均数，可以使序列变平滑，将N个连续的时间序列成员作为一个集合，计算该集合的平均值，并逐项推移该集合
profile_mat_norm_smooth <- apply(profile_mat_norm, 2, function(x) zoo::rollmean(x, smooth, fill = 1))#以smooth=51为滑动窗口长度，fill的意思是前25个滑动平均数都用1填充
profile_norm_smooth <- zoo::rollmean(profile_norm, smooth, fill = 1)                     #因此，profile_mat_norm_smooth反映了每个细胞的tss富集情况，profile_norm_smooth反映了所有细胞的tss的富集情况  
#enrichment
    max_finite <- function(x){
      suppressWarnings(max(x[is.finite(x)], na.rm=TRUE))
    }
    e_mat <- apply(profile_mat_norm_smooth, 2, function(x) max_finite(x[(flank-range):(flank+range)]))#取1900-2100范围的tss富集情况
    names(e_mat) <- colnames(profile_mat_norm_smooth)
    e <- max_finite(profile_norm_smooth[(flank-range):(flank+range)])                     #Summary
    df_mat <- data.frame(
      enrichment = e_mat,
      insertions = as.vector(table(mcols(insertions)[,by])[names(e_mat)]),#这里的insertions是每个细胞fragments数目
      insertionsWindow = as.vector(table(overlap$name)[names(e_mat)])
    )#这个是每个细胞overlap的数目
    df_sum <- data.frame(bp = (-flank):flank, profile = profile, norm_profile = profile_norm, smooth_norm_profile = profile_norm_smooth, enrichment = e)
    rownames(df_sum) <-  NULL
    
    return(list(df = df_sum, dfall = df_mat, profileMat = profile_mat_norm, profileMatSmooth = profile_mat_norm_smooth))
  }    
head(df_sum)
     bp profile norm_profile smooth_norm_profile enrichment
1 -2000    2190     1.137927                   1   17.91299
2 -1999    2060     1.070379                   1   17.91299
3 -1998    2256     1.172221                   1   17.91299
4 -1997    2247     1.167544                   1   17.91299
5 -1996    2275     1.182093                   1   17.91299
6 -1995    2127     1.105192                   1   17.91299
#可以看到在远离tss的地方，profile值（所有细胞在该位点总的counts)只有2000左右，当然smooth也低
df_sum[c(seq(1995,2005)),]#但是tss附近profile值很高，smooth也是。enrichment是smooth_norm_profile的最大值，所有细胞总的tss的最大富集分数
     bp profile norm_profile smooth_norm_profile enrichment
1995 -6   32300     16.78312            16.90412   17.91299
1996 -5   31558     16.39758            16.88616   17.91299
1997 -4   29566     15.36253            16.85958   17.91299
1998 -3   30395     15.79328            16.82941   17.91299
1999 -2   32723     17.00292            16.79287   17.91299
2000 -1   35846     18.62563            16.74796   17.91299
2001  0   36234     18.82724            16.69909   17.91299
2002  1   33384     17.34637            16.65906   17.91299
2003  2   31181     16.20169            16.62594   17.91299
2004  3   30488     15.84161            16.59353   17.91299
2005  4   32274     16.76961            16.56338   17.91299

批量运行

测试数据只有2万多个细胞，所以我像上面那样直接运行问题可能不大，实际数据量可能很大（几十万个细胞），所以考虑用batch进行分批处理的方法，每次处理100个细胞，最后把结果合并。注意作者包装的函数insertionProfileSingles里面还嵌套了一个函数insertionProfileSingles_helper，所以正式运行的话，外函数里运行的是下面的部分

uniqueTags <- as.character(unique(mcols(fragments)[,by]))
  splitTags <- split(uniqueTags, ceiling(seq_along(uniqueTags)/batchSize))
  #ceiling向上取整，把所有unique barcode的序号除以100取整，每100个barcode作为一个group，一共217个group
  pb <- txtProgressBar(min = 0, max = 100, initial = 0, style = 3)#展示进度条
  batchTSS <- lapply(seq_along(splitTags), function(x){
    setTxtProgressBar(pb, round(x * 100/length(splitTags), 0))#相当于每完成一个循环就进度条加1
    profilex <- insertionProfileSingles_helper(#之前的操作都是定义在insertionProfileSingles_helper这个函数里面，这里才正式传参运行，每次只提取100个细胞的fragments
      feature=feature, 
      fragments=fragments[which(mcols(fragments)[,by] %in% splitTags[[x]])], 
      by = by, 
      getInsertions = getInsertions,
      fix = fix, 
      flank = flank, 
      norm = norm, 
      smooth = smooth, 
      range = range
    )
    
    return(profilex)
  })
  df <- lapply(batchTSS, function(x) x$df) %>% Reduce("rbind",.)#把每个batch运行的结果合并
  dfall <- lapply(batchTSS, function(x) x$dfall) %>% Reduce("rbind",.)#dfall就是df_mat
  profileMat <- lapply(batchTSS, function(x) x$profileMat) %>% Reduce("cbind",.)
  profileMatSmooth <- lapply(batchTSS, function(x) x$profileMatSmooth) %>% Reduce("cbind",.)
  return(list(df = df, dfall = dfall, profileMat = profileMat, profileMatSmooth = profileMatSmooth))
}

引用：[https://doi.org/10.1038/s41587-019-0206-z]

scATAC分析实战（step1）