相关链接
0. 准备数据
#install
install.packages("devtools")
devtools::install_github("pcahan1/singleCellNet")
library(singleCellNet)
#download the data
#training metadata, https://s3.amazonaws.com/cnobjects/singleCellNet/examples/sampTab_TM_053018.rda
#training expression matrix , https://s3.amazonaws.com/cnobjects/singleCellNet/examples/expTM_Raw_053018.rda
#query metadata, https://s3.amazonaws.com/cnobjects/singleCellNet/examples/stDat_beads_mar22.rda
#query expression matrix, https://s3.amazonaws.com/cnobjects/singleCellNet/examples/6k_beadpurfied_raw.rda
#human-mouse orthologs.https://s3.amazonaws.com/cnobjects/singleCellNet/examples/human_mouse_genes_Jul_24_2018.rda
#loading training data MOUSE
stTM <- utils_loadObject(fname = "sampTab_TM_053018.rda") #metadata
expTMraw <- utils_loadObject(fname = "expTM_Raw_053018.rda") #expression matrix
#loading query data HUMAN
stQuery <- utils_loadObject(fname = "stDat_beads_mar22.rda") #metadata
expQuery <- utils_loadObject(fname = "6k_beadpurfied_raw.rda") #expression matrix
#Ortholog conversion for cross species analysis
download.file("https://s3.amazonaws.com/cnobjects/singleCellNet/examples/human_mouse_genes_Jul_24_2018.rda", "human_mouse_genes_Jul_24_2018.rda")
oTab <- utils_loadObject(fname = "human_mouse_genes_Jul_24_2018.rda")
aa = csRenameOrth(expQuery = expQuery, expTrain = expTMraw, orthTable = oTab)
#> query genes in ortholog table = 15532
#> training genes in ortholog table and query data = 14550
expQueryOrth <- aa[['expQuery']]
expTrainOrth <- aa[['expTrain']]
#若为同一物种
commonGenes<-intersect(rownames(expTrain), rownames(expQuery))
expTrain <- expTrain[commonGenes, ]
expQuery <- expQuery[commonGenes, ]
#seurat 对象的导入
seuratfile <- extractSeurat(sc, exp_slot_name = "counts")
sampTab = seuratfile$sampTab
expDat = seuratfile$expDat
1. Training the data 建立分类器
We use the same number of cells per cell type, i.e. balanced data, to train Top-Pair Random Forest classifier. The remaining of the data or the held-out data will be used as validation data to evaluate the performance of the TP-RF classifier. Empirically we have found 50 cells to be a minimal cell number to create a classifier with good performance, however it may vary depend on the quality of your reference data, so it is really important to assess the performance of your classifier before proceeding to query your sample of interest.
stList<-splitCommon(sampTab = stTM, ncells = 50, dLevel = "newAnn")#以newAnn分类细胞
alveolar macrophage : 62
B cell : 3134
bladder urothelial cell : 759
bladder_mesenchymal : 859
cardiac muscle cell : 60
cardiac_fibroblast : 222
chondrocyte-like : 165
endocardial cell : 52
endothelial cell : 1890
erythroblast : 152
erythrocyte : 74
granulocyte : 520
hematopoietic precursor cell : 117
hepatocyte : 882
keratinocyte : 1203
kidney capillary endothelial cell : 117
kidney proximal straight tubule epithelial cell : 618
kidney_duct_epithelial : 355
late pro-B cell : 141
limb_mesenchymal : 540
luminal epithelial cell of mammary gland : 137
lung_mammary_stromal : 2072
macrophage : 1340
mammary_basal_cell : 115
monocyte : 370
natural killer cell : 600
neuroendocrine cell : 282
skeletal muscle satellite cell : 190
T cell : 1823
tongue_basal_cell : 1726
trachea_epithelial : 434
trachea_mesenchymal : 3925
stTrain<-stList[[1]]
expTrain <- expTrainOrth[,stTrain$cell]
stTest <- stList[[2]]
expTest <- expTrainOrth[,stTest$cell]
This training step includes:
- normalize and log-transform & scale the training data,
- find top gene pairs and transform the training data into a binary matrix,
- train the Top-pair Random Forest classifier
#- If you increase nTopGenes and nTopGenePairs, you may get a even better classifier performance on query data!
class_info<-scn_train(stTrain = stTrain, expTrain = expTrain, nTopGenes = 10, nRand = 70, nTrees = 1000, nTopGenePairs = 25, dLevel = "newAnn", colName_samp = "cell")
#a list containing normalized expression data, classification gene list, cnPRoc
2. 评价分类器
#apply heldout data
system.time(classRes_val_all <- scn_predict(class_info[['cnProc']], expTest, nrand = 50))
#> Loaded in the cnProc
#> All Done
#> user system elapsed
#> 0.208 0.012 0.220
#assessment
tm_heldoutassessment <- assess_comm(ct_scores = classRes_val_all, stTrain = stTrain, stQuery = stTest, dLevelSID = "cell", classTrain = "newAnn", classQuery = "newAnn")
plot_PRs(tm_heldoutassessment)
3. 进行分类
crPBMC <- scn_predict(class_info[['cnProc']], expQueryOrth, nrand = 50)
#需要对crPBMC过滤才能完成下一步
test = crPBMC[,colnames(crPBMC) %in% colnames(expQueryOrth)]
stQuery <- assign_cate(classRes = test, sampTab = stQuery, cThresh = 0.5) #选择classification score higher than 0.5
4. 可视化
#create labels for classification heatmap
sgrp<-as.vector(stQuery$description)
names(sgrp)<-rownames(stQuery)
grpRand<-rep("rand", 50)
names(grpRand)<-paste("rand_", 1:50, sep='')
sgrp<-append(sgrp, grpRand)
sc_hmClass(crPBMC, sgrp, max=5000, isBig=TRUE, cCol=F, font=8)
sc_violinClass(sampTab = stQuery,classRes = crPBMC, cellIDCol = "sample_name", dLevel = "description", ncol = 9)
sc_violinClass(sampTab = stQuery, classRes = crPBMC, cellIDCol = "sample_name", dLevel = "description", ncol = 9, sub_cluster = "B cell")
# attribution plot
plot_attr(sampTab = stQuery, classRes = crPBMC, nrand=50, sid="sample_name", dLevel="description")
#Visualize top pair gene expression
gpTab <- compareGenePairs(query_exp = expQueryOrth, training_exp = expTrainOrth, training_st = stTrain, classCol = "newAnn", sampleCol = "cell", RF_classifier = class_info$cnProc$classifier, numPairs = 20, trainingOnly = FALSE)
#> [1] "All Good"
sgrp<-as.vector(stQuery$prefix)
names(sgrp)<-rownames(stQuery)
train <- findAvgLabel(gpTab, stTrain = stTrain, dLevel = "newAnn")
sgrp <- append(sgrp, train)
hm_gpa_sel(gpTab, genes = class_info$cnProc$xpairs, grps = sgrp, maxPerGrp = 5)
5. 插入seurat 对象中进行可视化
sc@meta.data$category = stQuery$category
DimPlot(sc,group.by = "category",label = T)