重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合（4）

8.8 Seurat v3, 3’ 10k PBMC和全血STRT-Seq
尽管我们的/data文件夹中已经有了所有必要的文件，我们仍可以从GEO数据库下载必要的文件：

download.file("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149938/suppl/GSE149938_umi_matrix.csv.gz",
              destfile = "GSE149938_umi_matrix.csv.gz")
download.file("https://cf.10xgenomics.com/samples/cell-exp/4.0.0/Parent_NGSC3_DI_PBMC/Parent_NGSC3_DI_PBMC_filtered_feature_bc_matrix.h5",
              destfile = "3p_pbmc10k_filt.h5")

> umi_gz <- gzfile("data/update/GSE149938_umi_matrix.csv.gz",'rt')  
> umi <- read.csv(umi_gz,check.names = F,quote = "")
> matrix_3p    <- Read10X_h5("data/update/3p_pbmc10k_filt.h5",use.names = T)

接下来，让我们创建Seurat对象并重新定义一些元数据列（GEO数据集将细胞类型放入orig.ident slot，这会干扰我们接下来要做的事情）：

> srat_wb <- CreateSeuratObject(t(umi),project = "whole_blood")
> srat_3p <- CreateSeuratObject(matrix_3p,project = "pbmc10k_3p")
> rm(umi_gz)
> rm(umi)
> rm(matrix_3p)
> colnames(srat_wb@meta.data)[1] <- "cell_type"
> srat_wb@meta.data$orig.ident <- "whole_blood"
> srat_wb@meta.data$orig.ident <- as.factor(srat_wb@meta.data$orig.ident)
> head(srat_wb[[]])
                   cell_type nCount_RNA nFeature_RNA  orig.ident
BNK_spBM1_L1_bar25       BNK      24494         1869 whole_blood
BNK_spBM1_L1_bar26       BNK      61980         2051 whole_blood
BNK_spBM1_L1_bar27       BNK     124382         3872 whole_blood
BNK_spBM1_L1_bar28       BNK       8144         1475 whole_blood
BNK_spBM1_L1_bar29       BNK      53612         2086 whole_blood
BNK_spBM1_L1_bar30       BNK      33582         2038 whole_blood

进行基本的QC。STRT-seq与10x有很大不同，每个细胞检测到的基因要多得多。此外，出于某种原因，全血数据集的矩阵中没有MT基因，但这并不重要。

> srat_wb <- SetIdent(srat_wb,value = "orig.ident")
> srat_wb[["percent.mt"]] <- PercentageFeatureSet(srat_wb, pattern = "^MT-")
> srat_wb[["percent.rbp"]] <- PercentageFeatureSet(srat_wb, pattern = "^RP[SL]")
> srat_3p[["percent.mt"]] <- PercentageFeatureSet(srat_3p, pattern = "^MT-")
> srat_3p[["percent.rbp"]] <- PercentageFeatureSet(srat_3p, pattern = "^RP[SL]")
> VlnPlot(srat_wb, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rbp"), ncol = 4)

> VlnPlot(srat_3p, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rbp"), ncol = 4)

用于处理GEO全血数据集的注释文件与Cell Ranger GRCh38-2020A有很大不同。让我们看看有多少个共有基因：

> table(rownames(srat_3p) %in% rownames(srat_wb))

FALSE  TRUE 
18050 18551
> common_genes <- rownames(srat_3p)[rownames(srat_3p) %in% rownames(srat_wb)]

过滤基因数量过高或过低、MT基因含量过高的细胞。另外，让我们将单个矩阵限制为仅包含共有基因：

> srat_3p <- subset(srat_3p, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mt < 15)
> srat_wb <- subset(srat_wb, subset = nFeature_RNA > 1000 & nFeature_RNA < 6000)
> srat_3p <- srat_3p[rownames(srat_3p) %in% common_genes,]
> srat_wb <- srat_wb[rownames(srat_wb) %in% common_genes,]

与之前的Seurat v3一样，让我们列出一个列表并为每个对象标准化/筛选HVG：

> wb_list <- list()
> wb_list[["pbmc10k_3p"]]   <- srat_3p
> wb_list[["whole_blood"]]  <- srat_wb
> for (i in 1:length(wb_list)) {
    wb_list[[i]] <- NormalizeData(wb_list[[i]], verbose = F)
    wb_list[[i]] <- FindVariableFeatures(wb_list[[i]], selection.method = "vst", nfeatures = 2000, verbose = F)
  }

接下来进行整合，Seurat 3分两步完成。

> wb_anchors <- FindIntegrationAnchors(object.list = wb_list, dims = 1:30)
> wb_seurat  <- IntegrateData(anchorset = wb_anchors, dims = 1:30)
> rm(wb_list)
> rm(wb_anchors)

对未整合的数据集进行基本的处理和可视化：

> DefaultAssay(wb_seurat) <- "RNA"
> wb_seurat <- NormalizeData(wb_seurat, verbose = F)
> wb_seurat <- FindVariableFeatures(wb_seurat, selection.method = "vst", nfeatures = 2000, verbose = F)
> wb_seurat <- ScaleData(wb_seurat, verbose = F)
> wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
> wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F) 
> DimPlot(wb_seurat,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, before integration")

现在，让我们看一下整合的数据集：

> DefaultAssay(wb_seurat) <- "integrated"
> wb_seurat <- ScaleData(wb_seurat, verbose = F)
> wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
> wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
> DimPlot(wb_seurat, reduction = "umap") + plot_annotation(title = "10k 3' PBMC and white blood cells, after integration (Seurat 3)")

让我们看一些marker：

> FeaturePlot(wb_seurat,c("MS4A1","LYZ","NKG7","PPBP","LTF","HBA1","FCER1A","IL7R","FCGR3B")) & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))

从图中我们可以看出，有一些重要的细胞类型在PBMC数据集中不存在，但存在于全血数据集中。LTF基因是中性粒细胞最显著的marker，HBA1是在红细胞中表达的血红蛋白基因。
现在让我们对整合矩阵进行聚类，并查看聚类在两个数据集之间的分布情况：

> wb_seurat <- FindNeighbors(wb_seurat, dims = 1:30, k.param = 10, verbose = F)
> wb_seurat <- FindClusters(wb_seurat, verbose = F)
> DimPlot(wb_seurat,label = T) + NoLegend()

聚类组成显示了全血数据集独有的许多聚类：

> count_table <- table(wb_seurat@meta.data$seurat_clusters, wb_seurat@meta.data$orig.ident)
> count_table
    
     pbmc10k_3p whole_blood
  0        1426         237
  1        1385          71
  2        1264         130
  3        1211         112
  4        1115         145
  5           0        1052
  6         355         467
  7         377         211
  8         386         199
  9           0         550
  10        343         157
  11        390          82
  12          0         441
  13        283         125
  14          7         388
  15          3         380
  16         19         362
  17          4         367
  18          2         316
  19        297          13
  20          0         308
  21          0         265
  22          0         222
  23          0         221
  24         15         179
  25        106          22
  26         93          19
  27          0         103
  28         77           3
  29          0          50
  30         32           2
  31          0          32
> plot_integrated_clusters(wb_seurat)

我们可以利用GSE149938中存在的元数据：

> meta <- wb_seurat[[]]
> table(meta[meta$seurat_clusters == '5',]$cell_type) ## erythrocytes

   BNK    CMP    ery   immB    MEP    MLP   preB   proB toxiNK 
     1      2   1040      3      1      1      2      1      1 
> table(meta[meta$seurat_clusters == '20',]$cell_type) ## neutrophils

    BNK     CLP     CMP     HSC    immB  kineNK matureN   metaN     MLP    myeN 
      2       2       1       1       1      55       1       7       1      99 
 plasma    proN  toxiNK 
      1     136       1 
> table(meta[meta$seurat_clusters == '24',]$cell_type) ## plasma

   ery   naiB plasma 
    11      2    166 
> table(meta[meta$seurat_clusters == '16',]$cell_type) ## platelets

    BNK     CLP     CMP     ery     HSC    LMPP matureN     MEP     MPP     NKP 
      1       3      61       4      72       1       1     144      74       1
> rm(wb_seurat)

8.9 Harmony, 3’ 10k PBMC和全血STRT-Seq
首先创建一个合并的Seurat数据集，并对其进行标准化和后续处理：

> wb_harmony    <- merge(srat_3p,srat_wb)
> wb_harmony <- NormalizeData(wb_harmony, verbose = F)
> wb_harmony <- FindVariableFeatures(wb_harmony, selection.method = "vst", nfeatures = 2000, verbose = F)
> wb_harmony <- ScaleData(wb_harmony, verbose = F)
> wb_harmony <- RunPCA(wb_harmony, npcs = 30, verbose = F)
> wb_harmony <- RunUMAP(wb_harmony, reduction = "pca", dims = 1:30, verbose = F)

看一下PCA图的变化，以及沿第一个主成分的分布：

> p1 <- DimPlot(object = wb_harmony, reduction = "pca", pt.size = .1, group.by = "orig.ident") + NoLegend()
> p2 <- VlnPlot(object = wb_harmony, features = "PC_1", group.by = "orig.ident", pt.size = .1) + NoLegend()
> plot_grid(p1,p2)

UMAP也显示了数据集之间的明显差异：

> DimPlot(wb_harmony,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, before integration")

使用SeuratWrappers库中名为RunHarmony的函数来运行harmony：

> wb_harmony <- wb_harmony %>% RunHarmony("orig.ident", plot_convergence = T)

这会生成我们稍后将用于所有下游分析的降维结果。

> harmony_embeddings <- Embeddings(wb_harmony, 'harmony')
> harmony_embeddings[1:5, 1:5]
                   harmony_1  harmony_2 harmony_3   harmony_4  harmony_5
AAACCCACATAACTCG-1  2.592137 -1.6045869  3.192993 -0.03594751  3.8447941
AAACCCACATGTAACC-1  4.244764  3.2122684 -9.738222 -5.90380632  0.9607984
AAACCCAGTGAGTCAG-1  3.208084 -2.1054920  2.035577  1.90984384  5.3665634
AAACGAACAGTCAGTT-1 -1.106694  1.8151680  3.092745 -2.34038488  7.6785360
AAACGAACATTCGGGC-1  4.735928 -0.4421468 -2.196355  2.77622970 -3.2385050

校正后的PCA和分布：

> p1 <- DimPlot(object = wb_harmony, reduction = "harmony", pt.size = .1, group.by = "orig.ident") + NoLegend()
> p2 <- VlnPlot(object = wb_harmony, features = "harmony_1", group.by = "orig.ident", pt.size = .1) + NoLegend()
> plot_grid(p1,p2)

进行UMAP降维和Louvain聚类：

> wb_harmony <- wb_harmony %>% 
    RunUMAP(reduction = "harmony", dims = 1:30, verbose = F) %>% 
    FindNeighbors(reduction = "harmony", k.param = 10, dims = 1:30) %>% 
    FindClusters() %>% 
    identity()
> wb_harmony <- SetIdent(wb_harmony,value = "orig.ident")
> DimPlot(wb_harmony,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, after integration (Harmony)")

> DimPlot(wb_harmony, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident') + NoLegend()

该数据集的整合结果看起来与Seurat v3非常相似：

> wb_harmony <- SetIdent(wb_harmony,value = "seurat_clusters")
> DimPlot(wb_harmony,label = T) + NoLegend()

更详细的聚类检查似乎也证实了这一点：

> plot_integrated_clusters(wb_harmony)

> rm(wb_harmony)

8.10 LIGER, 3’ 10k PBMC和全血STRT-Seq
最后，使用LIGER进行数据整合。此步骤需要一些时间来运行：

> wb_liger    <- merge(srat_3p,srat_wb) 
> wb_liger    <- NormalizeData(wb_liger)
> wb_liger    <- FindVariableFeatures(wb_liger)
> wb_liger    <- ScaleData(wb_liger, split.by = "orig.ident", do.center = F)
> wb_liger    <- RunOptimizeALS(wb_liger, k = 30, lambda = 5, split.by = "orig.ident")
> wb_liger    <- RunQuantileNorm(wb_liger, split.by = "orig.ident")

然后，使用与之前类似的设置执行Louvain聚类（FindNeighbors和FindClusters）：

> wb_liger    <- FindNeighbors(wb_liger,reduction = "iNMF",k.param = 10,dims = 1:30)
> wb_liger    <- FindClusters(wb_liger)

让我们用几种不同的方式来看一下整合后的UMAP图：

> wb_liger    <- RunUMAP(wb_liger, dims = 1:ncol(wb_liger[["iNMF"]]), reduction = "iNMF",verbose = F)
> wb_liger <- SetIdent(wb_liger,value = "orig.ident")
> DimPlot(wb_liger,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and 10k 5' PBMC cells, after integration (LIGER)")

> DimPlot(wb_liger, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident') + NoLegend()

最后，看一下每个cluster的分布：

> plot_integrated_clusters(wb_liger)

> rm(wb_liger)
> rm(srat_wb)
> rm(srat_3p)

往期内容：
重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合（1）
重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合（2）
重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合（3）

重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合（4）

推荐阅读更多精彩内容