单细胞练习1

数据获取：Transcriptional Heterogeneity in Naive and Primed Human Pluripotent Stem Cells at Single-Cell Resolution

代码获取：GitHub - MarioniLab/NaiveHESC2016: Code for Tobias, Ferdinand and Aaron's naive/primed hESC project.

处理人Naive，Primed状态干细胞的单细胞数据，观察其异质性，不同表达情况等等。根据对不同状态的干细胞数据的分析以及根据经验作者筛选出了一些基因，并将这些基因作为Naive-Primed的转化轴，并结合了小鼠，食蟹猴，人类的植入前胚胎发育情况进行了对比分析，观察这些基因在胚胎发育过程中的表达情况

数据处理，批量读入2383，2677，2384，2678，2739，2740数据并将其存放在all.counts对象内

setwd('E:\\tmp')
library(edgeR)
#创建空列表
all.counts <- list()
gene.names <- NULL
gene.length <- NULL
#对不同批次的矩阵进行读入
for (sample in c("2383", "2384", "2677", "2678", "2739", "2740")){
  cur.file <- list.files("E:\\tmp", pattern=paste0("genic_counts_", sample), full=TRUE)
  tmp = list.files('E:/tmp',full.names = T)
  current_counts <- read.table(cur.file, sep="\t", header=TRUE, row.names=1)
  # 检查不同文件基因名与基因长度是否相同
  if (is.null(gene.names)){
    gene.names <- rownames(current_counts)
    gene.length <- current_counts$Length
  } else {
    #数据一致性检测
    stopifnot(identical(gene.names, rownames(current_counts)))
    stopifnot(identical(gene.length, current_counts$Length))
  }
  current_counts$Length <- NULL
  # 融合技术重复，以及更改列名，^开头，$结尾，[ ]可替换数字
  cellname <- colnames(current_counts)
  #sub（）将后者''''替换为前者
  cellname <- sub("^lane[0-9]_", "", cellname)
  cellname <- sub("_L00[0-9]_", "_", cellname)
  cellname <- sub("_[12]$", "", cellname)
  colnames(current_counts) <- cellname
  if (any(duplicated(colnames(current_counts)))) {
    current_counts <- sumTechReps(current_counts)
    gc()
  }
  # 将读入矩阵添加到空列表中
  all.counts[[sample]] <- as.matrix(current_counts)
} 
sapply(all.counts, ncol)

将技术重复矩阵融合2677 和 2678, 2739 和 2740

stopifnot(identical(colnames(all.counts[["2677"]]), colnames(all.counts[["2678"]])))
all.counts[["2677"]] <- all.counts[["2677"]] + all.counts[["2678"]]
all.counts[["2678"]] <- NULL
stopifnot(identical(colnames(all.counts[["2739"]]), colnames(all.counts[["2740"]])))
all.counts[["2739"]] <- all.counts[["2739"]] + all.counts[["2740"]]
all.counts[["2740"]] <- NULL
sapply(all.counts, ncol)

随后将矩阵融合，并将批次信息注释于细胞名前

for (sample in names(all.counts)) {
  current <- all.counts[[sample]]
  colnames(current) <- paste0(sample, ".", colnames(current))
  all.counts[[sample]] <- current
}
#对象内数据融合到combined.counts
combined.counts <- do.call(cbind, all.counts)
dim(combined.counts)
rm(all.counts)
gc()

bioMart进行Gene ID转换以及染色体定位，基因类型的注释

library(BiocFileCache)
bmcache <- BiocFileCache("biomart", ask = FALSE)
loc <- bfcquery(bmcache, "hg38.ensGene", exact=TRUE)$rpath
if (length(loc)==0L) {
  library(biomaRt)
  ensembl <- useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset="hsapiens_gene_ensembl",
                    host="aug2017.archive.ensembl.org") # Ensembl version 90.
  ensemblGenes <- getBM(attributes=c('ensembl_gene_id',  'chromosome_name', 'gene_biotype',
                                    'external_gene_name', 'entrezgene'), filters="ensembl_gene_id",
                        values=gene.names, mart=ensembl)
  saveRDS(ensemblGenes, file=bfcnew(bmcache, "hg38.ensGene"))
} else {
  ensemblGenes <- readRDS(loc)
}

match注释信息和表达矩阵的顺序

features <- ensemblGenes[match(gene.names, ensemblGenes$ensembl_gene_id),]
features$ensembl_gene_id <- gene.names
features$entrezgene <- gsub(" ", "", as.character(features$entrezgene))
row.names(features) <- gene.names
features$Length <- gene.length
head(features)

提取线粒体与spike in基因到is.mito,is.spike

is.mito <- !is.na(features$chromosome_name) & features$chromosome_name=="MT"
summary(is.mito)
#按指定关键字搜索字符
is.spike <- grepl("^ERCC", gene.names)
summary(is.spike)

添加样本注释信息，批次信息
2383,2384->第一批次
2383,2677,2678->Naive

sample <- sub("^([0-9]+)\\..*", "\\1", colnames(combined.counts))
phenotype <- character(ncol(combined.counts))
phenotype[sample %in% c("2383", "2677", "2678")] <- "naive"
phenotype[sample %in% c("2384", "2739", "2740")] <- "primed"
table(phenotype)
batch <- character(ncol(combined.counts))
batch[sample %in% c("2383", "2384")] <- "1"
batch[sample %in% c("2677", "2678", "2739", "2740")] <- "2"
table(batch)

汇总注释信息到pheno数据框

pheno <- data.frame(phenotype, batch, sample, row.names = colnames(combined.counts))
head(pheno)

创建SingleCellExperiment对象，并添加spike in信息

library(SingleCellExperiment)
sce <- SingleCellExperiment(list(counts=as.matrix(combined.counts)),
                            rowData=features, colData=pheno)
isSpike(sce, "ERCC") <- is.spike
sce

载入scater，去除重复的symbol id

library(scater)
new.row.names <- uniquifyFeatureNames(
  rowData(sce)$ensembl_gene_id,
  rowData(sce)$external_gene_name
)
rownames(sce) <- new.row.names
head(rownames(sce), 20)

calculateQCMetrics数据质控，并将feature_controls中添加spike-in基因以及MT基因信息

sce <- calculateQCMetrics(sce, feature_controls=list(Mt=is.mito))
colnames(colData(sce))

根据总counts数，ERCC基因表达量，检测到表达的基因数量，线粒体基因表达量画小提琴图，根据条件过滤

multiplot(cols=2,
          plotColData(sce, x="sample", y="log10_total_counts"),
          plotColData(sce, x="sample", y="total_features_by_counts"),
          plotColData(sce, x="sample", y="pct_counts_ERCC"),
          plotColData(sce, x="sample", y="pct_counts_Mt")
)

各项指标下的小提琴图.png

根据文库，基因数量，线粒体基因以及spike in筛选离群细胞

libsize.drop <- isOutlier(sce$total_counts, nmads=3, type="lower", log=TRUE, batch=sce$sample) 
feature.drop <- isOutlier(sce$total_features_by_counts, nmads=3, type="lower", log=TRUE, batch=sce$sample) 
mito.drop <- isOutlier(sce$pct_counts_Mt, nmads=3, type="higher", batch=sce$sample)
spike.drop <- isOutlier(sce$pct_counts_ERCC, nmads=3, type="higher", batch=sce$sample)
discard <- libsize.drop | feature.drop | spike.drop | mito.drop
data.frame(ByLibSize=sum(libsize.drop), ByFeature=sum(feature.drop), ByMito=sum(mito.drop),
           BySpike=sum(spike.drop), Remaining.in.batch=sum(!discard))

筛选出离群样本之后，作者又为了确保不会丢弃隐藏在离群细胞的特定细胞类型。为此，我们研究保留的细胞和过滤掉的细胞之间基因表达的差异。

suppressWarnings({
lost <- calcAverage(counts(sce)[,discard])
kept <- calcAverage(counts(sce)[,!discard])
})
log.vals <- edgeR::cpm(cbind(lost, kept), log=TRUE, prior=1)
logfc <- log.vals[,1] - log.vals[,2]
head(sort(logfc, decreasing=TRUE), 20)
# Avoid loss of points when either average is zero.
lost <- pmax(lost, min(lost[lost > 0]))
kept <- pmax(kept, min(kept[kept > 0]))
plot(lost, kept, xlab="Average count (discarded)", 
ylab="Average count (retained)", log="xy", pch=16)
is.spike <- isSpike(sce)
points(lost[is.spike], kept[is.spike], col="red", pch=16)
is.mito <- rowData(sce)$is_feature_control_Mt
points(lost[is.mito], kept[is.mito], col="dodgerblue", pch=16)

保留的细胞与过滤细胞的基因表达差异.png

随后从sce中删除离群细胞

sce <- sce[, !discard]
dim(sce)

之后利用scran确定细胞周期影响

library(scran)
hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
set.seed(10000)
assigned <- cyclone(sce, pairs=hs.pairs, gene.names=rowData(sce)$ensembl_gene_id)
write.table(file=file.path("phases.tsv"), assigned, row.names=FALSE, sep="\t", quote=TRUE)
head(assigned$scores)

画图查看G1，G1，S期细胞分布

par(mfrow=c(1,2))
plot(assigned$scores$G1, assigned$scores$G2M, xlab="G1 score", ylab="G2/M score", pch=16)
plot(assigned$scores$G1, assigned$scores$S, xlab="G1 score", ylab="S score", pch=16)

细胞周期分布.png

随后将细胞周期信息导入到sce$phase中

sce$phase <- assigned$phases
table(assigned$phases)

观察细胞的平均表达丰度

ave.counts <- calcAverage(sce, use_size_factors=FALSE)
rowData(sce)$AveCount <- ave.counts
hist(log10(ave.counts), breaks=100, main="", col="grey", xlab=expression(Log[10]~"average count"))

ncells <- nexprs(sce, byrow=TRUE)
rowData(sce)$Ncells <- ncells
plot(ave.counts, ncells, xlab="Average count", ylab="Number of cells", log="x", pch=16, cex=0.5)

平均表达丰度.png

画出最高表达的基因

fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))
plotHighestExprs(sce, n=50) + fontsize

top50高表达基因.png

筛选表达量大于0的基因

keep <- ave.counts > 0
sce <- sce[keep,]
summary(keep)

标准化

首先快速聚类，之后按照不同的cluster进行标准化

set.seed(10000)
clusters <- quickCluster(sce, method="igraph", min.mean=1)
sce <- computeSumFactors(sce, cluster=clusters, min.mean=1)
summary(sizeFactors(sce))

对于spike-in基因是单独标准化的，从而会产生更差的相关性

sce <- computeSpikeFactors(sce, type="ERCC", general.use=FALSE) 
plot(sizeFactors(sce), sizeFactors(sce, type="ERCC"), log="xy", ylab="Spike-in size factor", 
xlab="Deconvolution size factor", col=ifelse(sce$phenotype=="naive", "black", "grey"))

spike-in基因的size factor相关性.png

最后，利用size factor进行标准化

sce <- normalize(sce)

表型与批次的密度图，批次曲线出现双峰

plotExplanatoryVariables(sce, variables=c("phenotype", "batch")) + fontsize

密度图.png

随后使用removeBatchEffect函数根据表型情况移除批次效应

norm_exprs(sce) <- removeBatchEffect(logcounts(sce),
design=model.matrix(~sce$phenotype), batch=sce$batch)
assayNames(sce)

保存sce以便进行下游分析

saveRDS(sce, file="E:/tmp/sce_all.rds")
sessionInfo()

单细胞转录组-1

单细胞转录组-1

单细胞练习1

标准化

推荐阅读更多精彩内容