pseudotime

比较不同pseudotime分析工具

在许多情况下，我们研究的是细胞不断变化的过程。例如，发育过程中发生的许多分化过程：受到刺激后，细胞会从一种细胞类型转变为另一种细胞类型。由于一些细胞在分化过程中比其他细胞进行得更快，每个snapshot都可能包含处于发育过程中不同位置的细胞。我们使用统计方法，把细胞沿着一条或多条代表潜在发育过程的轨迹进行排序，这种排序被称为伪时间（pseudotime）。

library(SingleCellExperiment)
library(TSCAN)
library(M3Drop)
library(monocle)
library(destiny)
library(SLICER)
library(scater)

## Warning: package 'scater' was built under R version 3.5.2

library(ggplot2)
library(ggthemes)
library(ggbeeswarm)
library(corrplot)

## Warning: package 'corrplot' was built under R version 3.5.2

# Seed the random number generator before any analysis step runs.
set.seed(1234)

# Read the serialized dataset from disk and print its summary.
deng_rds_path <- "D:/paopaoR/deng/deng-reads.rds"
deng_SCE <- readRDS(deng_rds_path)
deng_SCE

## class: SingleCellExperiment 
## dim: 22431 268 
## metadata(0):
## assays(2): counts logcounts
## rownames(22431): Hvcn1 Gbp7 ... Sox5 Alg11
## rowData names(10): feature_symbol is_feature_control ...
##   total_counts log10_total_counts
## colnames(268): 16cell 16cell.1 ... zy.2 zy.3
## colData names(30): cell_type2 cell_type1 ... pct_counts_ERCC
##   is_cell_control
## reducedDimNames(0):
## spikeNames(1): ERCC

# Stage labels, listed in the order the factor levels of `cell_type2`
# should take.
stage_levels <- c(
    "zy", "early2cell", "mid2cell", "late2cell",
    "4cell", "8cell", "16cell", "earlyblast",
    "midblast", "lateblast"
)
deng_SCE$cell_type2 <- factor(deng_SCE$cell_type2, levels = stage_levels)

# Keep a standalone copy of the re-levelled stage labels.
cellLabels <- deng_SCE$cell_type2

数据集由来自小鼠早期发育的10个不同时间点的268个细胞组成。细胞label可以作为真实时间标准来评价伪时间的准确性。

# Extract the counts assay as `deng` and replace its column names with
# the stage label of each cell.
deng <- counts(deng_SCE)
colnames(deng) <- cellLabels
# Run PCA on the object and plot the result, coloured by stage.
deng_SCE <- runPCA(deng_SCE)
plotPCA(deng_SCE, colour_by = "cell_type2")

## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

image.png

# Store the first column of the default reduced-dimension result as
# `PC1`, then plot it against the stage labels.
deng_SCE$PC1 <- reducedDim(deng_SCE)[, 1]
pc1_df <- as.data.frame(colData(deng_SCE))
ggplot(pc1_df, aes(x = PC1, y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_tableau() +
    theme_classic() +
    labs(
        x = "First principal component",
        y = "Timepoint",
        title = "Cells ordered by first principal component"
    )

image.png

TSCAN

# TSCAN: preprocess the count matrix, cluster the cells into 10 clusters
# and plot the clustering result.
procdeng <- TSCAN::preprocess(deng)
# Rename the cells to their column position; the TSCAN sample names are
# later converted back to numbers and used as indices into `deng_SCE`.
# seq_len() is used instead of `1:ncol(deng)` to avoid the 1:0 pitfall.
colnames(procdeng) <- seq_len(ncol(deng))
dengclust <- TSCAN::exprmclust(procdeng, clusternum = 10)
TSCAN::plotmclust(dengclust)

image.png

# Order the cells along the TSCAN trajectory. With orderonly = FALSE the
# result is used below as a table with `sample_name` and `Pseudotime`
# columns.
dengorderTSCAN <- TSCAN::TSCANorder(dengclust, orderonly = FALSE)
pseudotime_order_tscan <- as.character(dengorderTSCAN$sample_name)

# Cells that are absent from the TSCAN ordering keep NA as pseudotime.
deng_SCE$pseudotime_order_tscan <- NA
# `sample_name` holds the column positions assigned to the cells before
# clustering. Convert through character first: as.numeric() applied
# directly to a factor would return level codes, not the stored
# positions, and would silently scramble the assignment.
tscan_cell_index <- as.integer(pseudotime_order_tscan)
deng_SCE$pseudotime_order_tscan[tscan_cell_index] <-
    dengorderTSCAN$Pseudotime

# Show the stage labels of the cells whose TSCAN cluster id is 10.
cellLabels[dengclust$clusterid == 10]

##  [1] late2cell late2cell late2cell late2cell late2cell late2cell late2cell
##  [8] late2cell late2cell late2cell
## 10 Levels: zy early2cell mid2cell late2cell 4cell 8cell ... lateblast

# Plot the TSCAN pseudotime of every cell against its stage label.
tscan_df <- as.data.frame(colData(deng_SCE))
ggplot(
    tscan_df,
    aes(x = pseudotime_order_tscan, y = cell_type2, colour = cell_type2)
) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_tableau() +
    theme_classic() +
    labs(
        x = "TSCAN pseudotime",
        y = "Timepoint",
        title = "Cells ordered by TSCAN pseudotime"
    )

## Warning: Removed 47 rows containing missing values (position_quasirandom).

image.png

MONOCLE

# Run M3Drop feature selection on `deng` and keep the names found in the
# `Gene` column of its result as a character vector.
m3d_selection <- M3DropFeatureSelection(deng)
m3dGenes <- as.character(m3d_selection$Gene)

## Warning in bg__calc_variables(expr_mat): Warning: Removing 1134 undetected
## genes.

image.png

# Build the monocle CellDataSet from the M3Drop-selected genes.
# Keep only the selected genes, then drop rows with a repeated gene name.
# %in% never yields NA, so the logical mask can index rows directly.
d <- deng[rownames(deng) %in% m3dGenes, ]
d <- d[!duplicated(rownames(d)), ]

# Rename cells and genes to their positions, remembering the gene names.
# Presumably this keeps the matrix dimnames aligned with the default row
# names of `pd` and `fd` -- confirm against newCellDataSet().
colnames(d) <- seq_len(ncol(d))
geneNames <- rownames(d)
rownames(d) <- seq_len(nrow(d))
pd <- data.frame(timepoint = cellLabels)
pd <- new("AnnotatedDataFrame", data = pd)
fd <- data.frame(gene_short_name = geneNames)
fd <- new("AnnotatedDataFrame", data = fd)

# NOTE(review): `deng` holds counts() at this point; monocle's
# documentation suggests negbinomial.size() for count data and tobit()
# for FPKM/TPM values -- confirm that tobit() is intended here.
dCellData <- newCellDataSet(
    d,
    phenoData = pd,
    featureData = fd,
    expressionFamily = tobit()
)
# which() is kept on purpose: the positions it returns coincide with the
# row names assigned to `d` above, so they identify the ordering genes.
dCellData <- setOrderingFilter(dCellData, which(geneNames %in% m3dGenes))
dCellData <- estimateSizeFactors(dCellData)
dCellDataSet <- reduceDimension(dCellData, pseudo_expr = 1)

## Warning in if (cds@expressionFamily@vfamily %in% c("negbinomial",
## "negbinomial.size")) {: 条件的长度大于一，因此只能用其第一元素

## Warning in if (cds@expressionFamily@vfamily == "binomialff") {: 条件的长度
## 大于一，因此只能用其第一元素

## Warning in if (cds@expressionFamily@vfamily == "Tobit") {: 条件的长度大于
## 一，因此只能用其第一元素

## Warning in if (cds@expressionFamily@vfamily == "uninormal") {: 条件的长度大
## 于一，因此只能用其第一元素

# Order the cells along the learned trajectory (without reversing it)
# and plot the resulting trajectory.
dCellDataSet <- orderCells(dCellDataSet, reverse = FALSE)
plot_cell_trajectory(dCellDataSet)

image.png

# Collect the stage label, pseudotime and state of every cell from the
# phenoData of the ordered CellDataSet.
monocle_pheno <- phenoData(dCellDataSet)
pseudotime_monocle <-
    data.frame(
        Timepoint = monocle_pheno$timepoint,
        pseudotime = monocle_pheno$Pseudotime,
        State = monocle_pheno$State
    )
# Name the rows by cell position (one row per column of `d`); seq_len()
# replaces the `1:n` form, which misbehaves when n is 0.
rownames(pseudotime_monocle) <- seq_len(nrow(pseudotime_monocle))
# Cell positions sorted by increasing monocle pseudotime.
pseudotime_order_monocle <-
    rownames(pseudotime_monocle[order(pseudotime_monocle$pseudotime), ])

# Attach the pseudotime to the cells and plot it against the stage labels.
deng_SCE$pseudotime_monocle <- pseudotime_monocle$pseudotime
ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = pseudotime_monocle,
           y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_tableau() + theme_classic() +
    xlab("monocle pseudotime") + ylab("Timepoint") +
    ggtitle("Cells ordered by monocle pseudotime")

image.png

Diffusion maps

# From here on `deng` holds the logcounts assay, again with the columns
# labelled by stage. The diffusion map is computed on its transpose.
deng <- logcounts(deng_SCE)
colnames(deng) <- cellLabels
dm <- DiffusionMap(t(deng))

# Plot the first two diffusion components, coloured by stage.
tmp <- data.frame(
    DC1 = eigenvectors(dm)[, 1],
    DC2 = eigenvectors(dm)[, 2],
    Timepoint = deng_SCE$cell_type2
)
ggplot(tmp, aes(x = DC1, y = DC2, colour = Timepoint)) +
    geom_point() +
    scale_color_tableau() +
    labs(x = "Diffusion component 1", y = "Diffusion component 2") +
    theme_classic()

image.png

# Use the rank of each cell on the first diffusion component as its
# pseudotime, then plot it against the stage labels.
first_dc <- eigenvectors(dm)[, 1]
deng_SCE$pseudotime_diffusionmap <- rank(first_dc)
dm_df <- as.data.frame(colData(deng_SCE))
ggplot(
    dm_df,
    aes(x = pseudotime_diffusionmap, y = cell_type2, colour = cell_type2)
) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_tableau() +
    theme_classic() +
    labs(
        x = "Diffusion map pseudotime (first diffusion map component)",
        y = "Timepoint",
        title = "Cells ordered by diffusion map pseudotime"
    )

image.png

SLICER

这个包不太常见。找到可能的start细胞后，branch分析有时会报错，GitHub上也有人遇到同样的问题；换一个start细胞可能可以运行成功，但结果可能就不对了。

library("lle")

## Warning: package 'lle' was built under R version 3.5.2

## Warning: package 'snowfall' was built under R version 3.5.2

# Select genes with SLICER on the cells-by-genes matrix, then choose the
# neighbourhood size k by scanning the range 30 to 60.
genes <- select_genes(t(deng))
slicer_input <- t(deng[genes, ])
k <- select_k(slicer_input, kmin = 30, kmax = 60)

## finding neighbours
## calculating weights
## computing coordinates
## finding neighbours
## calculating weights
## computing coordinates
## finding neighbours
## calculating weights
## computing coordinates
## finding neighbours
## calculating weights
## computing coordinates
## finding neighbours
## calculating weights
## computing coordinates
## finding neighbours
## calculating weights
## computing coordinates
## finding neighbours
## calculating weights
## computing coordinates

# Embed the cells in two dimensions with LLE, using the selected genes
# and the chosen k, and keep the embedding coordinates (`Y`).
lle_fit <- lle(t(deng[genes, ]), m = 2, k = k)
traj_lle <- lle_fit$Y

## finding neighbours
## calculating weights
## computing coordinates

# Store the LLE embedding on the object under the name "LLE" and plot
# it, coloured by stage.
reducedDim(deng_SCE, "LLE") <- traj_lle
plotReducedDim(deng_SCE, use_dimred = "LLE", colour_by = "cell_type2") +
    labs(
        x = "LLE component 1",
        y = "LLE component 2",
        title = "Locally linear embedding of cells from SLICER"
    )

## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.

image.png

# Build SLICER's kNN graph on the LLE embedding (second argument 10)
# and plot it.
traj_graph <- conn_knn_graph(traj_lle, 10)
plot(traj_graph, main = "Fully connected kNN graph from SLICER")

image.png

# Find candidate extreme cells from the graph and the embedding; the
# first of them is used as the trajectory start below.
ends <- find_extreme_cells(traj_graph, traj_lle)

image.png

# Use the first extreme cell found by SLICER as the trajectory start.
start <- ends[1]

# Order the cells and assign them to branches from that start cell.
pseudotime_order_slicer <- cell_order(traj_graph, start)
branches <- assign_branches(traj_graph, start)

pseudotime_slicer <-
    data.frame(
        Timepoint = cellLabels,
        pseudotime = NA,
        State = branches
    )
# `pseudotime_order_slicer` is used as a vector of cell indices; each
# indexed cell receives its position within that vector as pseudotime.
# seq_along() replaces `1:length(x)`, which yields c(1, 0) for empty x.
pseudotime_slicer$pseudotime[pseudotime_order_slicer] <-
    seq_along(pseudotime_order_slicer)
deng_SCE$pseudotime_slicer <- pseudotime_slicer$pseudotime

# Plot the SLICER cell ordering against the stage labels. The theme is
# applied once (the original repeated theme_classic()), and a title is
# added to match the other pseudotime plots in this file.
ggplot(as.data.frame(colData(deng_SCE)),
       aes(x = pseudotime_slicer,
           y = cell_type2, colour = cell_type2)) +
    geom_quasirandom(groupOnX = FALSE) +
    scale_color_tableau() + theme_classic() +
    xlab("SLICER pseudotime (cell ordering)") +
    ylab("Timepoint") +
    ggtitle("Cells ordered by SLICER pseudotime")

image.png

单细胞学习5

单细胞学习5

pseudotime

比较不同pseudotime分析工具

TSCAN

MONOCLE

Diffusion maps

SLICER