建立Seurat对象
library(dplyr)
library(Seurat)
library(patchwork)
library(SingleR)
library(celldex)
library(pheatmap)
# 加载单细胞数据集(三个文件:barcodes.tsv.gz features.tsv.gz matrix.mtx.gz)
pbmc.data <- Read10X(data.dir = "outs/filtered_feature_bc_matrix")
# 使用未标准化的原始数据初始化Seurat对象
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "treat", min.cells = 3, min.features = 200)
pbmc
#An object of class Seurat
#28782 features across 10689 samples within 1 assay
#Active assay: RNA (28782 features, 0 variable features)
Read10x函数可以直接读取cellRanger处理过的10x单细胞测序数据文件,返回表达矩阵,该矩阵中的值表示在每个细胞(列)中检测到的每个特征(行)的数量。
CreateSeuratObject从Read10x读取的原始数据创建Seurat对象。
参数说明:
- counts : 一个类似表达矩阵的对象,其中包含非标准化的数据,单元格为列,要素为行,或者是一个派生对象
- project : Seurat对象的项目名称
- min.cells : 至少细胞数量中检测到了基因
- min.features : 至少检测到基因表达数的细胞。
数据QC 与细胞过滤
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc[["percent.ribo"]] <- PercentageFeatureSet(pbmc, pattern = "^RPL|^RPS")
violin <- VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.ribo"), pt.size = 0.01,ncol = 4) + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
ggsave("QC/vlnplot_before_qc.png", plot = violin, width = 12, height = 6)
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
pdf("FeatureScatter.pdf")
plot1 + plot2
dev.off()
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 10 & percent.ribo < 20)
pbmc
#An object of class Seurat
#28782 features across 8285 samples within 1 assay
#Active assay: RNA (28782 features, 0 variable features)
Normalizing the data
#Normalized values are stored in pbmc[["RNA"]]@data.
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
Identification of highly variable features (feature selection)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
pdf("VariableFeature.pdf")
plot1 / plot2
dev.off()
Scaling the data
Next, we apply a linear transformation (‘scaling’) that is a standard pre-processing step prior to dimensional reduction techniques like PCA. The `[ScaleData()] function:
- Shifts the expression of each gene, so that the mean expression across cells is 0
- Scales the expression of each gene, so that the variance across cells is 1
- This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate
- The results of this are stored in
pbmc[["RNA"]]@scale.data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
Perform linear dimensional reduction
Next we perform PCA on the scaled data. By default, only the previously determined variable features are used as input, but can be defined using features argument if you wish to choose a different subset.
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
pdf("PCA.pdf")
VizDimLoadings(pbmc, dims = 1:4, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
dev.off()
Determine the ‘dimensionality’ of the dataset
To overcome the extensive technical noise in any single feature for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a ‘metafeature’ that combines information across a correlated feature set.
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
pdf("dimensionality.pdf")
ElbowPlot(pbmc)
JackStrawPlot(pbmc, dims = 1:20)
dev.off()
Cluster the cells
pbmc <- FindNeighbors(pbmc, dims = 1:18)
pbmc <- FindClusters(pbmc, resolution = 0.5)
pbmc <- RunUMAP(pbmc, dims = 1:18)
pdf("cluster_umap.pdf")
DimPlot(pbmc, reduction = "umap")
saveRDS(pbmc, file = "pbmc_tutorial.rds")
dev.off()
Finding differentially expressed features (cluster biomarkers)
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% slice_max(n = 2, order_by = avg_log2FC)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC) -> top5
pdf("DoHeatmap.pdf")
DoHeatmap(pbmc, features = top5$gene) + NoLegend()
dev.off()
Assigning cell type identity to clusters
#load(file="DATABASE/CellTypeAnno/celldex_hpca.RData")
hpca.ref <- HumanPrimaryCellAtlasData()
pred <- SingleR(test = pbmc@assays$RNA@data, ref = hpca.ref, labels = hpca.ref$label.main, clusters = pbmc@active.ident)
table(pred$labels)
pdf("cell_type.pdf")
plotScoreHeatmap(pred)
plotDeltaDistribution(pred, ncol = 3)
new.cluster.ids <- pred$labels
names(new.cluster.ids) <- levels(pbmc)
UMAPPlot(object = pbmc, pt.size = 0.5, label = TRUE)
dev.off()