说明:仅根据官网指南加个人理解,相应图片参考官网(目前官网上最新的Tutorial已经更新成Seurat3.0版本,下面的流程是2.4版本,有些许出入。新版本将会在2019年4月16日通过CRAN下载)
以下是Seurat(v2.4版本)标准的数据处理过程,包括构建Seurat对象、QC、数据标准化、检测高变异基因等
1.构建Seurat对象
library(Seurat)
library(dplyr)
#读取10X数据
pbmc.data <- Read10X(data.dir = "~/Downloads/filtered_gene_bc_matrices/hg19/")
#构建Seurat对象,这里会有个初筛,保证所有基因在至少3个细胞中表达(0.1%细胞数),保证每个细胞至少能检测到200个基因。
pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")
2. QC选择细胞进行后续分析
#UMI (Unique Molecular Identifier)是一段固定长度的随机小片段,用以区别不同的PCR扩增模板
#barcode,标记不同的样品或者细胞
#根据细胞内基因数以及线粒体基因所占比例进行过滤细胞
#提取线粒体DNA并计算其比例
mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
#使用AddMetaData函数,将上面合并到pbmc对象中,方便后续的QC
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
#过滤细胞,细胞数控制在200~2500之间,如果不希望设定阈值,用-Inf表示,线粒体DNA所占比例控制在0.05以内
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
3.对数据进行标准化
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
4.在单细胞水平上检测变异基因
#这里参数的设置根据数据的类型、样本的异质性以及标准化的方法,存在差异
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
#查看筛选出来的变异基因的数量,一般是2000多
length(x = pbmc@var.genes)
5.除去不想要的变异来源
#这里就包括除去技术水平的噪音、批次效应、细胞状态等
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
6.PCA降维处理
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)
#通过下面的几种function可以查看定义特定PC的细胞和基因
PrintPCA(object = pbmc, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
VizPCA(object = pbmc, pcs.use = 1:2)
PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
#PCHeatmap这个功能可以很清楚的展示一个数据集的异质性,同时对于确定下游分析中用哪一个PC有着很大的帮助。对于探索相关的基因集合有着很大的帮助
PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)#用PC1作图
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, label.columns = FALSE, use.full = FALSE)#用PC1~12作图
7.确定在统计学上显著的PC
#这一步耗时很长
pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
#这一步可以看一下每一个PC的相关性和显著性,实线在虚线上方
JackStrawPlot(object = pbmc, PCs = 1:12)
#确定需要选取的PC,通过曲线的走势
PCElbowPlot(object = pbmc)
8.对细胞进行聚类
#resolution这一参数一般设定在0.6~1.2之间(细胞数3K左右),聚类结果放在object@ident中
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = TRUE)
9.进行非线性降维(tSNE)
#tSNE的输入,建议和之前聚类所用的PC一样
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
#这一步,你可以添加参数do.label=T,显示每一类群的label
TSNEPlot(object = pbmc)
#保存这一对象
saveRDS(pbmc, file = "~/Projects/datasets/pbmc3k/pbmc_tutorial.rds")
10. 找差异表达的基因(聚类marker)
#找到cluster1所对应的marker(positive+negative)
#ident.1 用来指定cluster
#min.pct 用来指定特定基因需要在%的细胞中被检测到
cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
print(x = head(x = cluster1.markers, n = 5))
#找到区分cluster5和cluster0,3的marker
cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
print(x = head(x = cluster5.markers, n = 5))
#找到区分所有cluster的marker
#只展示positive
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
#可视化marker的表达
VlnPlot(object = pbmc, features.plot = c("MS4A1", "CD79A"))
#可以选择用raw UMI counts
VlnPlot(object = pbmc, features.plot = c("NKG7", "PF4"), use.raw = TRUE, y.log = TRUE)
#用PCA或者tSNE图可视化基因的表达
FeaturePlot(object = pbmc, features.plot = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"), cols.use = c("grey", "blue"), reduction.use = "tsne")
#针对给定的细胞和基因,用DoHeatmap函数画出表达热图
#这里选定的是每个cluster中表达量前十的基因
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
#slim.col.label= TRUE表示,只展示cluster的ID而不是名字
DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label = TRUE, remove.key = TRUE)
11.更改cluster的名字(确定细胞类型的前提下)
current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7)
new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 0.5)
12.进一步细分细胞类型
#如果你修改分辨率参数resolution或者改变PC的个数,有可能会使得原来的细胞类群进一步的细分。这样你可以进一步探究在这个细分的cluster中,两者之间的marker有什么区别
#在此之前,需要对新的聚类群进行重命名,不然会导致之前的结果被覆盖
#分辨率为0.6
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
#分辨率为0.8
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.8, print.output = FALSE)
#出图
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6", no.legend = TRUE, do.label = TRUE)
plot_grid(plot1, plot2)