单细胞/时空文章代码复现——数据集整合

我们在上一篇中已经对损伤愈合组的单细胞转录组数据进行了细胞类型注释，接下来，我们使用整合分析，将文中数据与其他文章数据整合来验证注释的准确性，并借助已注释好的单细胞转录组数据去注释其他的单细胞数据。
本篇除之前一直使用的损伤愈合组数据外，其他数据来自GEO数据库的GSE265972（VU）、GSE165816（DFU）和EBI的E-MTAB-8142（human_adult_skin）。

1. 损伤愈合组与其他人皮肤数据的整合

在正文中，作者为了验证细胞注释的结果，将损伤愈合组的单细胞转录组数据与已发表的一个成年人皮肤的单细胞转录组数据（E-MTAB-8142）进行了整合。该数据可以从EBI数据库中获取，下载的两个文件分别为表达矩阵文本文件（arrayexpress_counts.txt）和细胞元信息文件（arrayexpress_metadata.txt），将这两个文件读入后创建为SeuratObject，并使用元信息替换meta.data。我们依然使用harmony对两个数据集进行整合，在整合时以样本维度而不是分组维度。

library(Seurat)
library(harmony)
library(dplyr)
library(ggplot2)

###读入注释好的acute_wound数据
obj <- readRDS('/path/to/sc_data/cell_type.rds')
obj$group <- 'acute_wound'

###读入E-MTAB-8142数据并转换为Seurat对象
count <- read.table('E-MTAB-8142/arrayexpress_counts.txt', header=T, row.names='Gene', sep='\t')
pub <- CreateSeuratObject(count, assay='RNA', min.cells=3, min.features=200)
meta <- read.table('E-MTAB-8142/arrayexpress_metadata.txt', header=T, row.names='Cell', sep='\t')
pub@meta.data <- meta

pub$group <- 'human_adult_skin'
colnames(pub@meta.data)[5] <- 'main_type'
colnames(pub@meta.data)[6] <- 'cell_type'

###合并两个对象
DefaultAssay(obj) <- 'RNA'
data_merge <- merge(x=obj, y=pub)

data_merge <- SCTransform(data_merge, verbose = FALSE, return.only.var.genes = FALSE, assay='RNA')
data_merge <- RunPCA(data_merge, verbose = FALSE)
data_merge <- RunUMAP(data_merge, dims=1:30)
DimPlot(data_merge, reduction = "umap", group.by='group', pt.size=0.3, label = F, repel = TRUE)

###整合
data_merge <- RunHarmony(data_merge, group.by.vars = "orig.ident", dims=1:30, reduction.use='pca', assay.use = "SCT")
data_merge <- RunUMAP(data_merge, reduction = "harmony", dims = 1:30)
DimPlot(data_merge, reduction = "umap", group.by='group', pt.size=0.3, label = F, repel = TRUE)

整合前两组样本的UMAP图

整合后两组样本的UMAP图

通过UMAP图来可视化两个数据集分别的注释结果。

DimPlot(data_merge, reduction = "umap", group.by='main_type', 
        split.by='group', pt.size=0.1, label = T, label.size=3, repel = TRUE) + NoLegend()

两个数据集的细胞大类

原文图

对比原文图，我们可以看出harmony无法完全消除两个数据集之间的差异，说明两个数据集在生物学背景上是不同的；而不同数据集的主要细胞类型在UMAP图的分布上又是基本一致的，说明整合具有较好的效果。
接下来，文中又使用FindTransferAnchors来寻找两个数据集之间的锚点，并计算预测得分来表征细胞亚类之间的相似性。文中以每个query的cell_type中预测的各reference中细胞类型得分的中位数来指征二者的相似性。

###Find anchor
obj.anchors <- FindTransferAnchors(reference = pub, query = obj, dims = 1:30)
predictions <- TransferData(anchorset = obj.anchors, refdata = pub$cell_type, dims = 1:30)

meta <- cbind(acute_wound=obj@meta.data$cell_type, predictions)
meta <- meta[, c(-2,-43)]
colnames(meta) <- c("acute_wound", "Fibroblast_2", "Fibroblast_3", "Stromal_Schwann", 
                     "Fibroblast_1", "Schwann", "Pericyte", 
                     "Vascular_endothelium_2", "Vascular_endothelium_1", "Vascular_endothelium_3", 
                     "Keratinocyte_postmitotic", "Keratinocyte_CD83", "Keratinocyte_premitotic", "Keratinocyte_mitotic", 
                     "Melanocyte", "Plasma_cell", "Macrophage_2", "DC2","Macrophage_1",
                     "IL23_DC","Monocyte","moDC1","Migratory_cDC","DC1","pDC","moDC2",
                     "LC4","Th","NK2","LC2","KLF10.LC","LC3","LC1",
                     "Mast_cell","Treg","ILC","NK1","Tc",
                     "Lymphatic_endothelium_2","Lymphatic_endothelium_1","NK3")
###计算median prediction score并生成矩阵
mat <- matrix(, nrow=length(levels(factor(meta$acute_wound))), 
              ncol=length(colnames(meta))-1,
              dimnames=list(levels(factor(meta$acute_wound)),colnames(meta)[-1]))
for (i in levels(factor(meta$acute_wound))) {
    for (j in 2:length(colnames(meta))) {
        mat[i, colnames(meta)[j]] <- median(meta[meta$acute_wound==i, colnames(meta)[j]])
    }
}

这样我们就获得了一个行为acute_wound的细胞类型，列为human_adult_skin的细胞类型，填充值为每个acute_wound的细胞类型中每个human_adult_skin细胞类型预测得分的中位数（0~1）。下面我们使用ComplexHeatmap包做出文中的热图。

library(ComplexHeatmap)
###矩阵排序
mat_ord <- mat[c("Bas-I", "Bas-prolif", "Bas-mig", 
                 "Spi-I", "Spi-II", "Spi-mig", 
                 "Gra", "HF", 
                 "MEL", 
                 "FB-I", "FB-III", "FB-prolif", 
                 "PC-vSMC", "LE", "VE", 
                 "NK-cell", "Th", "Plasma_Bcell", "Mast-cell", 
                 "Mono-Mac", "cDC1", "cDC2", "DC3", "LC"), 
               c("Keratinocyte_CD83", "Keratinocyte_mitotic", "Keratinocyte_postmitotic", "Keratinocyte_premitotic",
                 "Melanocyte",
                 "Schwann", "Stromal_Schwann", 
                 "Fibroblast_1", "Fibroblast_2", "Fibroblast_3", 
                 "Pericyte", "pDC",
                 "Vascular_endothelium_1", "Vascular_endothelium_2", "Vascular_endothelium_3", 
                 "Lymphatic_endothelium_1", "Lymphatic_endothelium_2",
                 "ILC", "NK1", "NK2", "NK3",
                 "IL23_DC", "Tc", "Th", "Treg",
                 "Mast_cell", "Plasma_cell",
                 "Macrophage_1", "Macrophage_2", 
                 "DC1", "DC2", "LC1", "LC2", "LC3", "LC4",
                 "Monocyte", "Migratory_cDC", "KLF10.LC", 
                 "moDC1", "moDC2")]
 ###为细胞类型设置颜色                
 aw.cols <- c("#d94701", "#fd8d3c", "#fdbe85",
             "#33A02C", "#72BF5A", "#B2DF8A",
             "#f768a1", "#d4b9da",
             "#737373",
             "#0570b0", "#92c5de", "#d1e5f0",
             "#1a9850", "#fb9a99", "#8d4720",
             "#35978f", "#41b6c4", "#80cdc1","#df65b0",
             "#dd3497", "#807dba","#6a3d9a","#9e9ac8", "#b15928")

names(aw.cols) <- levels(factor(meta[meta$group=='acute_wound', 'cell_type'], 
                      levels=c("Bas-I", "Bas-prolif", "Bas-mig", 
                               "Spi-I", "Spi-II", "Spi-mig", 
                               "Gra", "HF", 
                               "MEL", 
                               "FB-I", "FB-III", "FB-prolif", 
                               "PC-vSMC", "LE", "VE", 
                               "NK-cell", "Th", "Plasma_Bcell", "Mast-cell", 
                               "Mono-Mac", "cDC1", "cDC2", "DC3", "LC"),
                      ordered=T))
                      
 pub.cols <- c("#B5B6DB", "#166E8A", "#7B7DC6", "#7DBFD4",
              "#D5DEA0",
              "#A6559B", "#3E64A2", 
              "#A4D7E2", "#6BA0D5", "#3A64AD",
              "#E4DCC0", "#F19570", 
              "#EAD6E8", "#EF93AA", "#E23725", 
              "#0D783D", "#7FBD70",
              "#CEDFEF", "#4DA1D1", "#1B4179", "#0F223E",
              "#86AB3E", "#A0CA7A", "#90398F", "#A67FBA", 
              "#C39371", "#F2DDEB", 
              "#D43A6B", "#B97781", 
              "#DF7A90", "#E49CC4", 
              "#194791", "#BADFE9", "#9CDCED", "#1F5BBD",
              "#852E8A", "#EFC1DA", "#7D85BA", 
              "#4AB6AF", "#68D4CD")
             
names(pub.cols) <- levels(factor(meta[meta$group=='human_adult_skin', 'cell_type'], 
                          levels=c("Keratinocyte_CD83", "Keratinocyte_mitotic", "Keratinocyte_postmitotic", "Keratinocyte_premitotic",
                                   "Melanocyte",
                                   "Schwann", "Stromal_Schwann", 
                                   "Fibroblast_1", "Fibroblast_2", "Fibroblast_3", 
                                   "Pericyte", "pDC",
                                   "Vascular_endothelium_1", "Vascular_endothelium_2", "Vascular_endothelium_3", 
                                   "Lymphatic_endothelium_1", "Lymphatic_endothelium_2",
                                   "ILC", "NK1", "NK2", "NK3",
                                   "IL23_DC", "Tc", "Th", "Treg",
                                   "Mast_cell", "Plasma_cell",
                                   "Macrophage_1", "Macrophage_2",
                                   "DC1", "DC2", "LC1", "LC2", "LC3", "LC4",
                                   "Monocyte", "Migratory_cDC", "KLF10.LC", 
                                   "moDC1", "moDC2"),
                          ordered=T))
###行、列注释
row_an <- rowAnnotation(foo=rownames(mat_ord), 
                        col=list(foo=aw.cols), show_legend=F, 
                        show_annotation_name = FALSE)
col_an <- HeatmapAnnotation(foo=colnames(mat_ord), 
                            col=list(foo=pub.cols), show_legend=F, 
                            show_annotation_name = FALSE)
###做热图
Heatmap(mat_ord, 
        cluster_rows=F, cluster_columns=F, 
        col=colorRamp2(c(0, 0.5, 1),c('#1C1C1C', '#CD3278', '#FFF68F')), 
        rect_gp=gpar(lwd=2, col='grey'), 
        right_annotation=row_an, bottom_annotation=col_an, 
        heatmap_legend_param = list(title = "Median predicted score"))

复现图

原文图

原文应该是对human_adult_skin的细胞类型进行了部分的合并，因此我们做出的图的横轴和原文有一定差别，但是这不妨碍我们从热图中看出两个数据集的cell_type注释具有较高的一致性。

2. 损伤愈合组与VU、DFU的整合

我们首先使用前面介绍的预处理和组内整合方法，将VU和DFU数据分别进行预处理和整合。然后使用harmony整合三个数据集，由于我们前面介绍了多次harmony的整合代码，这里不做赘述，只是注意在整合前可以给三个分组加上一个label，以便后续作图或进行其他分析时可以根据label进行拆分。

obj1$group <- 'acute_wound'
obj2$group <- 'VU'
obj3$group <- 'DFU'

直接来看下整合的结果。

三个数据集整合后的UMAP图

可以看到harmony较好的将三个数据集整合到一起。

文中首先使用FindTransferAnchors和MapQuery函数将acute_wound的cell_type label转移至VU和DFU中。之后，又结合整合结果的无监督聚类对细胞类型进行精细化标注，将模糊的细胞类型分配结果与各细胞群体中的主要类型进行匹配，来实现急慢性伤口间细胞异质性的精准比较。

但我们缺乏相应的背景知识，所以我们仅将label转移和无监督聚类的结果进行展示，后面还会用到三个数据集的注释结果进行分析，这里暂且按下不表。

obj1 <- readRDS('/path/to/sc_data/cell_type.rds')
obj1$group <- 'acute_wound'
obj1 <- RunUMAP(obj1, reduction = "harmony", dims = 1:30, return.model = TRUE)

data_merge <- readRDS('three_inte.rds')

data.anchors <- FindTransferAnchors(reference = obj1, query = data_merge, dims = 1:30, reference.reduction = "pca")

data_merge <- MapQuery(anchorset = data.anchors, 
                       reference = obj1, query = data_merge, 
                       refdata = list(celltype = "cell_type"), 
                       reference.reduction = "pca", reduction.model = "umap")

data_merge2 <- MapQuery(anchorset = data.anchors, 
                        reference = obj1, query = data_merge,
                        refdata = list(celltype = "main_type"), 
                        reference.reduction = "pca", reduction.model = "umap")

cols <- c("#1c6597", "#cc6805", "#c6abd4", "#569395", "#dc65d8", "#f99190", "#057605","#575757")
names(cols) <- c('Fibroblast', 'Keratinocyte', 'Myeloid', 'Lymphoid', 'Mast', 'Endothelial', 'Pericyte and SMC', 'Melanocyte')

DimPlot(data_merge2, group.by='predicted.celltype', split.by='group', pt.size=0.3, cols=cols, raster.dpi=c(256, 256))

cols <- c(colorRampPalette(c("#1f78b4", "#a6cee3"))(3),
          colorRampPalette(c("#33a02c", "#b2df8a"))(3), "#df65b0",  
          colorRampPalette(c("#fdbf6f", "#ff7f00"))(3), 
          "#696969", "#d8bfd8", "#008b00", "#fb9a99", "#8b5a2b", "#da70d6",
          colorRampPalette(c("#66cdaa", "#5f9ea0"))(2), 
          colorRampPalette(c("#cab2d6", "#6a3d9a"))(6)）
names(cols) <- c("FB-I","FB-III","FB-prolif",
                 "Spi-I","Spi-II","Spi-mig","Gra",
                 "Bas-I","Bas-mig","Bas-prolif",
                 "MEL","HF","PC-vSMC","LE","VE",
                 "Mast-cell","Th","NK-cell",
                 "Mono-Mac","cDC1","cDC2","DC3","Plasma_Bcell","LC")
                 
DimPlot(data_merge, group.by='predicted.celltype', split.by='group', pt.size=0.3, cols=cols, raster.dpi=c(256, 256))

主要细胞类型在三个数据集中的分布

原文图

细胞亚类的UMAP图

我们暂时以细胞亚类label转移的结果作为VU和DFU的细胞类型注释结果，在后面进行分析时再结合无监督聚类结果调整。

以上，我们就完成了验证注释准确性的数据整合，和使用已注释好的单细胞转录组数据去注释其他的单细胞数据的数据整合。

往期内容：
单细胞/时空文章代码复现——数据预处理及整合
 单细胞/时空文章代码复现——细胞类型注释及UMAP图优化