最近在处理小鼠全脑的单细胞数据,归纳总结下细胞类型鉴定的过程吧。我用的算法是Seurat label transfer function, 抽取了Tubula Muris部分已知数据,用这个算法进行了鉴定,准确率可以达到98%。后来我做细胞类型鉴定用的都是这个算法加已知marker双重验证的方法。
1.首先下载两个小鼠全脑的数据
a.Tabula Muris
下载的是Tabula Muris中脑的数据
facs_Brain_Non-Myeloid_seurat_tiss.Robj
facs_Brain_Myeloid_seurat_tiss.Robj
其中Myeloid主要是Microglia 以及Macrophage细胞类型
Non-Myeloid主要是Neuron, Astrocyte, Oligodendrocyte, Endothelial等
b.Mouse Cell Atlas
找这套数据的脑的数据找了我半天
后来发现可以在他们文章里找到数据的下载链接
数据存储在figshare
但是我的数据 我是从Tabula muris github网址上的data路径中下载下来的数据
从Tabula muris github网址中可以找到Seurat处理好的MCA的数据可以直接导入到R里面分析,很方便
这个压缩包里包含两个文件
MCA_CellAssignments.csv
MCA_All-batch-removed-assignments.csv
我可纳闷了为啥不添加细胞类型信息 有毒
但是这个下载下来的数据是没有细胞类型的,需要自己到这个网址再下载一下,我下载的是MCA_CellAssignment.csv
,然后用R代码处理一下就可以得到对应的细胞类型。
代码如下
MCA_exp <- readRDS("MCA_merged_mat.rds")
MCA_celltypeinfo <- read.csv("MCA_CellAssignments.csv")
MCA_cellname <- read.csv("MCA_All-batch-removed-assignments.csv")
index <- match(MCA_cellname$Cell.name, MCA_celltypeinfo$Cell.name)
selected_cellname <- MCA_celltypeinfo[na.omit(index), ]
dim(selected_cellname)
brain_cellname <- selected_cellname[selected_cellname$Tissue=="Brain", ]
brain_exp <- MCA_exp[, as.character(brain_cellname$Cell.name)]
all.equal(as.character(brain_cellname$Cell.name), colnames(brain_exp))
MCA_brain_rds <- list()
MCA_brain_rds$exp <- brain_exp
MCA_brain_rds$meta.data <- brain_cellname
saveRDS(MCA_brain_rds, "MCA_brain_exp.rds")
2.构建Seurat Reference Object
这里遇到一个问题 就是用SCTtransform的时候 没办法运行成功 后来我用NormalizeData方法就可以的
a.Tabula Muris
load("facs_Brain_Non-Myeloid_seurat_tiss.Robj")
selected_cells <- rownames(tiss@meta.data)
Non_Myeloid_exp <- tiss@raw.data[, selected_cells]
Non_Myeloid_label <- tiss@meta.data$cell_ontology_class
all.equal(colnames(Non_Myeloid_exp), rownames(tiss@meta.data))
dim(Non_Myeloid_exp) #23341 3401
# write.csv(tiss@meta.data, "Tabula_Muris_Non_Myeloid_metadata1.csv")
load("facs_Brain_Myeloid_seurat_tiss.Robj")
selected_cells <- rownames(tiss@meta.data)
Myeloid_exp <- tiss@raw.data[, selected_cells]
Myeloid_label <- tiss@meta.data$cell_ontology_class
all.equal(colnames(Myeloid_exp), rownames(tiss@meta.data))
dim(Myeloid_exp) #23341 4455
# write.csv(tiss@meta.data, "Tabula_Muris_Myeloid_metadata1.csv")
## 我在表格中去掉了Rik结尾的基因 以及4-Sep这样子的基因 基因名字写在了"rownames.csv"
## 是因为这些基因感觉没什么实质性作用 可以跳过这里 不做任何处理
genes <- as.character(read.csv("rownames.csv", row.names= 1)$x)
Brain_exp <- cbind(Non_Myeloid_exp, Myeloid_exp)
Cell_ontology_class <- data.frame(celltype = c(Non_Myeloid_label, Myeloid_label))
rownames(Cell_ontology_class) <- colnames(Brain_exp)
Brain_exp <- Brain_exp[genes, ]
Brain_exp <- Brain_exp[rowSums(Brain_exp>5) >= 20 & rowSums(Brain_exp) >= 20, ]
Brain_exp <- na.omit(Brain_exp)
Tabula_Muris_rds <- list()
Tabula_Muris_rds$exp <- Brain_exp
Tabula_Muris_rds$celltype <- Cell_ontology_class$celltype
saveRDS(Tabula_Muris_rds, "Tabula_Muris_rawdata_with_cellinfo.rds")
# 2) 构建NormalizeData的ref.seurat用来做reference data
ref_exp <- Brain_exp
ref.seurat <- CreateSeuratObject(counts = ref_exp)
ref.seurat <- NormalizeData(ref.seurat, verbose = FALSE)
ref.seurat <- FindVariableFeatures(ref.seurat, selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
ref.seurat <- ScaleData(ref.seurat)
ref.seurat <- RunPCA(object = ref.seurat, npcs = 30, verbose = FALSE)
ref.seurat$celltype <- Cell_ontology_class$celltype
saveRDS(ref.seurat, file = "Tabula_Muris_NormalizeData_ref_seurat.rds")
# 3) 选取了部分数据 做reference 推断 准确率可以达到98.8%
index <- sample(colnames(Brain_exp), 500)
test_data <- Brain_exp[, index]
test_celltype <- Cell_ontology_class[index, 1]
test.seurat <- CreateSeuratObject(counts = test_data)
test.seurat <- NormalizeData(test.seurat, verbose = FALSE)
test.seurat <- FindVariableFeatures(test.seurat, selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
test.seurat <- ScaleData(test.seurat)
test.seurat <- RunPCA(object = test.seurat, npcs = 30, verbose = FALSE)
test.seurat$original_celltype <- test_celltype
test.anchors <- FindTransferAnchors(reference = ref.seurat,
query = test.seurat,
dims = 1:30)
predictions <- TransferData(anchorset = test.anchors,
refdata = ref.seurat$celltype,
dims = 1:30)
test.seurat <- AddMetaData(test.seurat, metadata = predictions)
b.Mouse Cell Atlas
MCA_exp <- readRDS("MCA_merged_mat.rds")
MCA_celltypeinfo <- read.csv("MCA_CellAssignments.csv")
MCA_cellname <- read.csv("MCA_All-batch-removed-assignments.csv")
index <- match(MCA_cellname$Cell.name, MCA_celltypeinfo$Cell.name)
selected_cellname <- MCA_celltypeinfo[na.omit(index), ]
dim(selected_cellname)
brain_cellname <- selected_cellname[selected_cellname$Tissue=="Brain", ]
brain_exp <- MCA_exp[, as.character(brain_cellname$Cell.name)]
all.equal(as.character(brain_cellname$Cell.name), colnames(brain_exp))
MCA_brain_rds <- list()
MCA_brain_rds$exp <- brain_exp
MCA_brain_rds$meta.data <- brain_cellname
saveRDS(MCA_brain_rds, "MCA_brain_exp.rds")
# 2.构建seurat reference
library(Seurat)
ref_exp <- brain_exp
ref.seurat <- CreateSeuratObject(counts = ref_exp)
ref.seurat <- NormalizeData(ref.seurat, verbose = FALSE)
ref.seurat <- FindVariableFeatures(ref.seurat, selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
ref.seurat <- ScaleData(ref.seurat)
ref.seurat <- RunPCA(object = ref.seurat, npcs = 30, verbose = FALSE)
ref.seurat$celltype <- as.character(brain_cellname$Annotation)
saveRDS(ref.seurat, file = "MCA_NormalizeData_ref_seurat.rds")
写在最后 佛系更文 很开心 终于有一百个粉丝了开心快乐的一天啊