8.8 Seurat v3, 3’ 10k PBMC和全血STRT-Seq
尽管我们的/data文件夹中已经有了所有必要的文件,我们仍可以从GEO数据库下载必要的文件:
download.file("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE149nnn/GSE149938/suppl/GSE149938_umi_matrix.csv.gz",
destfile = "GSE149938_umi_matrix.csv.gz")
download.file("https://cf.10xgenomics.com/samples/cell-exp/4.0.0/Parent_NGSC3_DI_PBMC/Parent_NGSC3_DI_PBMC_filtered_feature_bc_matrix.h5",
destfile = "3p_pbmc10k_filt.h5")
> umi_gz <- gzfile("data/update/GSE149938_umi_matrix.csv.gz",'rt')
> umi <- read.csv(umi_gz,check.names = F,quote = "")
> matrix_3p <- Read10X_h5("data/update/3p_pbmc10k_filt.h5",use.names = T)
接下来,让我们创建Seurat
对象并重新定义一些元数据列(GEO数据集将细胞类型放入orig.ident
slot,这会干扰我们接下来要做的事情):
> srat_wb <- CreateSeuratObject(t(umi),project = "whole_blood")
> srat_3p <- CreateSeuratObject(matrix_3p,project = "pbmc10k_3p")
> rm(umi_gz)
> rm(umi)
> rm(matrix_3p)
> colnames(srat_wb@meta.data)[1] <- "cell_type"
> srat_wb@meta.data$orig.ident <- "whole_blood"
> srat_wb@meta.data$orig.ident <- as.factor(srat_wb@meta.data$orig.ident)
> head(srat_wb[[]])
cell_type nCount_RNA nFeature_RNA orig.ident
BNK_spBM1_L1_bar25 BNK 24494 1869 whole_blood
BNK_spBM1_L1_bar26 BNK 61980 2051 whole_blood
BNK_spBM1_L1_bar27 BNK 124382 3872 whole_blood
BNK_spBM1_L1_bar28 BNK 8144 1475 whole_blood
BNK_spBM1_L1_bar29 BNK 53612 2086 whole_blood
BNK_spBM1_L1_bar30 BNK 33582 2038 whole_blood
进行基本的QC。STRT-seq与10x有很大不同,每个细胞检测到的基因要多得多。此外,出于某种原因,全血数据集的矩阵中没有MT基因,但这并不重要。
> srat_wb <- SetIdent(srat_wb,value = "orig.ident")
> srat_wb[["percent.mt"]] <- PercentageFeatureSet(srat_wb, pattern = "^MT-")
> srat_wb[["percent.rbp"]] <- PercentageFeatureSet(srat_wb, pattern = "^RP[SL]")
> srat_3p[["percent.mt"]] <- PercentageFeatureSet(srat_3p, pattern = "^MT-")
> srat_3p[["percent.rbp"]] <- PercentageFeatureSet(srat_3p, pattern = "^RP[SL]")
> VlnPlot(srat_wb, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rbp"), ncol = 4)
> VlnPlot(srat_3p, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rbp"), ncol = 4)
用于处理GEO全血数据集的注释文件与Cell Ranger GRCh38-2020A有很大不同。让我们看看有多少个共有基因:
> table(rownames(srat_3p) %in% rownames(srat_wb))
FALSE TRUE
18050 18551
> common_genes <- rownames(srat_3p)[rownames(srat_3p) %in% rownames(srat_wb)]
过滤基因数量过高或过低、MT基因含量过高的细胞。另外,让我们将单个矩阵限制为仅包含共有基因:
> srat_3p <- subset(srat_3p, subset = nFeature_RNA > 500 & nFeature_RNA < 5000 & percent.mt < 15)
> srat_wb <- subset(srat_wb, subset = nFeature_RNA > 1000 & nFeature_RNA < 6000)
> srat_3p <- srat_3p[rownames(srat_3p) %in% common_genes,]
> srat_wb <- srat_wb[rownames(srat_wb) %in% common_genes,]
与之前的Seurat
v3一样,让我们列出一个列表并为每个对象标准化/筛选HVG:
> wb_list <- list()
> wb_list[["pbmc10k_3p"]] <- srat_3p
> wb_list[["whole_blood"]] <- srat_wb
> for (i in 1:length(wb_list)) {
wb_list[[i]] <- NormalizeData(wb_list[[i]], verbose = F)
wb_list[[i]] <- FindVariableFeatures(wb_list[[i]], selection.method = "vst", nfeatures = 2000, verbose = F)
}
接下来进行整合,Seurat 3分两步完成。
> wb_anchors <- FindIntegrationAnchors(object.list = wb_list, dims = 1:30)
> wb_seurat <- IntegrateData(anchorset = wb_anchors, dims = 1:30)
> rm(wb_list)
> rm(wb_anchors)
对未整合的数据集进行基本的处理和可视化:
> DefaultAssay(wb_seurat) <- "RNA"
> wb_seurat <- NormalizeData(wb_seurat, verbose = F)
> wb_seurat <- FindVariableFeatures(wb_seurat, selection.method = "vst", nfeatures = 2000, verbose = F)
> wb_seurat <- ScaleData(wb_seurat, verbose = F)
> wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
> wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
> DimPlot(wb_seurat,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, before integration")
现在,让我们看一下整合的数据集:
> DefaultAssay(wb_seurat) <- "integrated"
> wb_seurat <- ScaleData(wb_seurat, verbose = F)
> wb_seurat <- RunPCA(wb_seurat, npcs = 30, verbose = F)
> wb_seurat <- RunUMAP(wb_seurat, reduction = "pca", dims = 1:30, verbose = F)
> DimPlot(wb_seurat, reduction = "umap") + plot_annotation(title = "10k 3' PBMC and white blood cells, after integration (Seurat 3)")
让我们看一些marker:
> FeaturePlot(wb_seurat,c("MS4A1","LYZ","NKG7","PPBP","LTF","HBA1","FCER1A","IL7R","FCGR3B")) & scale_colour_gradientn(colours = rev(brewer.pal(n = 11, name = "Spectral")))
从图中我们可以看出,有一些重要的细胞类型在PBMC数据集中不存在,但存在于全血数据集中。LTF基因是中性粒细胞最显著的marker,HBA1是在红细胞中表达的血红蛋白基因。
现在让我们对整合矩阵进行聚类,并查看聚类在两个数据集之间的分布情况:
> wb_seurat <- FindNeighbors(wb_seurat, dims = 1:30, k.param = 10, verbose = F)
> wb_seurat <- FindClusters(wb_seurat, verbose = F)
> DimPlot(wb_seurat,label = T) + NoLegend()
聚类组成显示了全血数据集独有的许多聚类:
> count_table <- table(wb_seurat@meta.data$seurat_clusters, wb_seurat@meta.data$orig.ident)
> count_table
pbmc10k_3p whole_blood
0 1426 237
1 1385 71
2 1264 130
3 1211 112
4 1115 145
5 0 1052
6 355 467
7 377 211
8 386 199
9 0 550
10 343 157
11 390 82
12 0 441
13 283 125
14 7 388
15 3 380
16 19 362
17 4 367
18 2 316
19 297 13
20 0 308
21 0 265
22 0 222
23 0 221
24 15 179
25 106 22
26 93 19
27 0 103
28 77 3
29 0 50
30 32 2
31 0 32
> plot_integrated_clusters(wb_seurat)
我们可以利用GSE149938中存在的元数据:
> meta <- wb_seurat[[]]
> table(meta[meta$seurat_clusters == '5',]$cell_type) ## erythrocytes
BNK CMP ery immB MEP MLP preB proB toxiNK
1 2 1040 3 1 1 2 1 1
> table(meta[meta$seurat_clusters == '20',]$cell_type) ## neutrophils
BNK CLP CMP HSC immB kineNK matureN metaN MLP myeN
2 2 1 1 1 55 1 7 1 99
plasma proN toxiNK
1 136 1
> table(meta[meta$seurat_clusters == '24',]$cell_type) ## plasma
ery naiB plasma
11 2 166
> table(meta[meta$seurat_clusters == '16',]$cell_type) ## platelets
BNK CLP CMP ery HSC LMPP matureN MEP MPP NKP
1 3 61 4 72 1 1 144 74 1
> rm(wb_seurat)
8.9 Harmony, 3’ 10k PBMC和全血STRT-Seq
首先创建一个合并的Seurat
数据集,并对其进行标准化和后续处理:
> wb_harmony <- merge(srat_3p,srat_wb)
> wb_harmony <- NormalizeData(wb_harmony, verbose = F)
> wb_harmony <- FindVariableFeatures(wb_harmony, selection.method = "vst", nfeatures = 2000, verbose = F)
> wb_harmony <- ScaleData(wb_harmony, verbose = F)
> wb_harmony <- RunPCA(wb_harmony, npcs = 30, verbose = F)
> wb_harmony <- RunUMAP(wb_harmony, reduction = "pca", dims = 1:30, verbose = F)
看一下PCA图的变化,以及沿第一个主成分的分布:
> p1 <- DimPlot(object = wb_harmony, reduction = "pca", pt.size = .1, group.by = "orig.ident") + NoLegend()
> p2 <- VlnPlot(object = wb_harmony, features = "PC_1", group.by = "orig.ident", pt.size = .1) + NoLegend()
> plot_grid(p1,p2)
UMAP也显示了数据集之间的明显差异:
> DimPlot(wb_harmony,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, before integration")
使用SeuratWrappers
库中名为RunHarmony
的函数来运行harmony
:
> wb_harmony <- wb_harmony %>% RunHarmony("orig.ident", plot_convergence = T)
这会生成我们稍后将用于所有下游分析的降维结果。
> harmony_embeddings <- Embeddings(wb_harmony, 'harmony')
> harmony_embeddings[1:5, 1:5]
harmony_1 harmony_2 harmony_3 harmony_4 harmony_5
AAACCCACATAACTCG-1 2.592137 -1.6045869 3.192993 -0.03594751 3.8447941
AAACCCACATGTAACC-1 4.244764 3.2122684 -9.738222 -5.90380632 0.9607984
AAACCCAGTGAGTCAG-1 3.208084 -2.1054920 2.035577 1.90984384 5.3665634
AAACGAACAGTCAGTT-1 -1.106694 1.8151680 3.092745 -2.34038488 7.6785360
AAACGAACATTCGGGC-1 4.735928 -0.4421468 -2.196355 2.77622970 -3.2385050
校正后的PCA和分布:
> p1 <- DimPlot(object = wb_harmony, reduction = "harmony", pt.size = .1, group.by = "orig.ident") + NoLegend()
> p2 <- VlnPlot(object = wb_harmony, features = "harmony_1", group.by = "orig.ident", pt.size = .1) + NoLegend()
> plot_grid(p1,p2)
进行UMAP降维和Louvain聚类:
> wb_harmony <- wb_harmony %>%
RunUMAP(reduction = "harmony", dims = 1:30, verbose = F) %>%
FindNeighbors(reduction = "harmony", k.param = 10, dims = 1:30) %>%
FindClusters() %>%
identity()
> wb_harmony <- SetIdent(wb_harmony,value = "orig.ident")
> DimPlot(wb_harmony,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and whole blood, after integration (Harmony)")
> DimPlot(wb_harmony, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident') + NoLegend()
该数据集的整合结果看起来与Seurat v3非常相似:
> wb_harmony <- SetIdent(wb_harmony,value = "seurat_clusters")
> DimPlot(wb_harmony,label = T) + NoLegend()
更详细的聚类检查似乎也证实了这一点:
> plot_integrated_clusters(wb_harmony)
> rm(wb_harmony)
8.10 LIGER, 3’ 10k PBMC和全血STRT-Seq
最后,使用LIGER
进行数据整合。此步骤需要一些时间来运行:
> wb_liger <- merge(srat_3p,srat_wb)
> wb_liger <- NormalizeData(wb_liger)
> wb_liger <- FindVariableFeatures(wb_liger)
> wb_liger <- ScaleData(wb_liger, split.by = "orig.ident", do.center = F)
> wb_liger <- RunOptimizeALS(wb_liger, k = 30, lambda = 5, split.by = "orig.ident")
> wb_liger <- RunQuantileNorm(wb_liger, split.by = "orig.ident")
然后,使用与之前类似的设置执行Louvain聚类(FindNeighbors
和FindClusters
):
> wb_liger <- FindNeighbors(wb_liger,reduction = "iNMF",k.param = 10,dims = 1:30)
> wb_liger <- FindClusters(wb_liger)
让我们用几种不同的方式来看一下整合后的UMAP图:
> wb_liger <- RunUMAP(wb_liger, dims = 1:ncol(wb_liger[["iNMF"]]), reduction = "iNMF",verbose = F)
> wb_liger <- SetIdent(wb_liger,value = "orig.ident")
> DimPlot(wb_liger,reduction = "umap") + plot_annotation(title = "10k 3' PBMC and 10k 5' PBMC cells, after integration (LIGER)")
> DimPlot(wb_liger, reduction = "umap", group.by = "orig.ident", pt.size = .1, split.by = 'orig.ident') + NoLegend()
最后,看一下每个cluster的分布:
> plot_integrated_clusters(wb_liger)
> rm(wb_liger)
> rm(srat_wb)
> rm(srat_3p)
往期内容:
重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合(1)
重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合(2)
重生之我在剑桥大学学习单细胞RNA-seq分析——8. scRNA-seq数据整合(3)