最近有点时间,打算复现一篇单细胞的文章。记于2023/03/28
参照文章:Single-cell RNA landscape of the osteoimmunology microenvironment in periodontitis (thno.org)
大概就是对牙周炎组织做了一个单细胞测序;样本有HCs组(four healthy controls)、PDs( five patients with severe chronic periodontitis )、PDTs(three patients with severe chronic periodontitis after initial periodontal therapy within 1 month )。共计12个文库样本。由于是同一个组织不同处理组,所以我的基本处理思路:
按原始流程跑下来之后,根据不同的组的细胞降维聚类结果判断是否具有批次效应;在去除批次效应之后,又重新降维聚类,并走完接下来的Clsuter注释等。
直接从下载该文章的GEO数据
这里我下载的是GSE171213_RAW.tar,后面需要自己整合。当然也可以下载文章的原始数据,使用cellranger从头跑一遍,具体流程代码示例:单细胞实战(四) Cell Ranger流程概览 (qq.com)
下载并解压GSE171213_RAW.tar文件后,得到
tar -xvf GSE171213_RAW.tar
#GSM5220920_HC1.cellname.list.txt
#GSM5220920_HC1.counts.tsv
#GSM5220921_HC2.cellname.list.txt
#GSM5220921_HC2.counts.tsv
#GSM5220922_HC3.cellname.list.txt
#GSM5220922_HC3.counts.tsv
#GSM5220923_HC4.cellname.list.txt
#GSM5220923_HC4.counts.tsv
#GSM5220924_PD1.cellname.list.txt
#GSM5220924_PD1.counts.tsv
#GSM5220925_PD2.cellname.list.txt
#GSM5220925_PD2.counts.tsv
#GSM5220926_PD3.cellname.list.txt
#GSM5220926_PD3.counts.tsv
#GSM5220927_PD4.cellname.list.txt
#GSM5220927_PD4.counts.tsv
#GSM5220928_PD5.cellname.list.txt
#GSM5220928_PD5.counts.tsv
#GSM5220929_PDT1.cellname.list.txt
#GSM5220929_PDT1.counts.tsv
#GSM5220930_PDT2.cellname.list.txt
#GSM5220930_PDT2.counts.tsv
#GSM5220931_PDT3.cellname.list.txt
#GSM5220931_PDT3.counts.tsv
创建Seurat Object对象并整合数据
##大致流程:读取--创建SeuratObject对象--数据质控--标准化、识别高边基因、归一化表达矩阵--降维聚类--类型注释
library(Seurat)
library(dplyr)
library(edgeR)
library(stringr)
library(patchwork)
library(ggpubr)
rm(list = ls())
#set.seed(12123) #这里最好设置随机种子,保证每次操作的可重复性
folders = list.files(pattern = "*tsv")
samples_name = strsplit(folders, "_") %>% sapply(function(x)x[2]) %>% strsplit("\\.") %>% sapply(function(x)x[1])
##samples_name : "HC1" "HC2" "HC3" "HC4" "PD1" "PD2" "PD3" "PD4" "PD5" "PDT1" "PDT2" "PDT3"
##使用lapply函数对每一个组创建一个单独的Seurat object对象。
sceList = lapply(folders,function(folder){
CreateSeuratObject(counts = read.table(folder,header = T,sep = "\t",row.names = 1),
min.cells = 60,min.features = 200,project = strsplit(folder, "_") %>%
sapply(function(x)x[2]) %>% strsplit("\\.") %>% sapply(function(x)x[1]))
})
names(sceList) =samples_name ##修改名称
##对每个object的meta.data添加分组信息,例如:
sceList[[1]]@meta.data$group <- "HC"
sceList[[2]]@meta.data$group <- "HC"
sceList[[3]]@meta.data$group <- "HC"
sceList[[4]]@meta.data$group <- "HC"
sceList[[5]]@meta.data$group <- "PD"
sceList[[6]]@meta.data$group <- "PD"
sceList[[7]]@meta.data$group <- "PD"
sceList[[8]]@meta.data$group <- "PD"
sceList[[9]]@meta.data$group <- "PD"
sceList[[10]]@meta.data$group <- "PDT"
sceList[[11]]@meta.data$group <- "PDT"
sceList[[12]]@meta.data$group <- "PDT"
###整合
sce.big = merge(sceList[[1]],sceList[2:length(sceList)],
add.cell.ids = samples_name)
save(sce.big,file = 'sce.big.merge.teeth.Rdata') ##保存
###查看每个分组中的细胞数目
table(sce.big$orig.ident)
[result]:
HC1 HC2 HC3 HC4 PD1 PD2 PD3 PD4 PD5 PDT1 PDT2 PDT3
3781 2637 4485 4534 4503 3676 4290 5398 4361 6677 8320 4387
质控并初步判断是否存在批次效应
###QC,这里我是用的文章的标准:每个细胞的线粒体含量:≤ 40%;每个细胞的基因数目:≥ 200
sce.big[["percent.mt"]] = PercentageFeatureSet(sce.big,pattern = "^MT-")
head(sce.big@meta.data,5)
sce.big = subset(sce.big,subset = nFeature_RNA>200 & percent.mt < 40)
VlnPlot(sce.big,pt.size = 0,features = c ("nFeature_RNA","percent.mt"),ncol=2)
##查看是否有批次效应;这里使用的是Seurat的步骤:NormalizeData ==> FindVariableFeatures ==> ScaleData;
sce.big = NormalizeData(sce.big, normalization.method = "LogNormalize", scale.factor = 10000)%>%
FindVariableFeatures (selection.method = "vst", nfeatures = 2000)%>%ScaleData()
sce.big = RunPCA(sce.big,verbose = F)
ElbowPlot(sce.big, ndims = 50) ##用于判断接下来使用的Dims个数
pc.num=1:30
sce.big = sce.big %>% RunTSNE(dims=pc.num)%>% RunUMAP(dims = pc.num)
sce.big = sce.big %>% FindNeighbors(dims = pc.num)%>% FindClusters()
DimPlot(sce.big,label = T)
DimPlot(sce.big,group.by = "orig.ident")
DimPlot(sce.big,group.by = "group")
DimPlot(sce.big,group.by = "orig.ident",split.by = "orig.ident",ncol=4)
这么一看,未作CCA和harmony前,未出现明显的批次效应。尽管如此,我们还是走一遍去批次的流程。常用的方法有Seurat包的CCA以及Harmony,相比而言,CCA容易出现过矫正的情况,这里我们使用Harmony(根据文章的来)
去除批次效应
###去除批次效应
#使用harmony去除批次效应
cellinfo = subset(sce.big@meta.data,select = c("orig.ident","group","nCount_RNA","nFeature_RNA"))
scRNA = CreateSeuratObject(sce.big@assays$RNA@counts,meta.data = cellinfo)#重新创建Serurat object
#这里我们使用SCTranscform代替Seurat流程中的NormalizeData ==> FindVariableFeatures ==> ScaleData步骤
scRNA <- SCTransform(sce.big)
scRNA = RunPCA(scRNA,npcs=50,verbose=FALSE)
scRNA = RunHarmony(scRNA,group.by.vars = "orig.ident",
assay.use = "SCT",
max.iter.harmony = 20)
##使用Harmony去除批次效应,批次的来源(group.by.vars)为orig.ident,矩阵(assay)为“SCT”
ElbowPlot(scRNA, ndims = 50)
pc.num=1:10
scRNA = RunTSNE(scRNA,reduction = "harmony",dims=pc.num)%>%
RunUMAP(reduction="harmony",dims = pc.num)%>%FindNeighbors()%>%FindClusters(resolution = 0.5)
debatch1=DimPlot(scRNA,group.by = "orig.ident")
debatch2=DimPlot(scRNA,group.by = "group",split.by = "group",ncol=4)
debatch3=DimPlot(scRNA,label = TRUE) ##可以看到使用resolution=0.5的时候,cluster有28个,该值是否合适有待商榷!
design = "AA
BC"
wrap_plots(debatch2, debatch1, debatch3, design = design)
##使用clustree查看不同resolution值的分群情况。
library(cluster)
library(clustree)
scRNA = FindClusters(scRNA,resolution =c(seq(.1,1.5,.2))) #这里resolution从0.1到1.5,步长0.2,共计8个值。
clustree_plot = clustree(scRNA@meta.data, prefix = "SCT_snn_res.")
colnames(scRNA@meta.data)
[result]:
1] "orig.ident" "nCount_RNA" "nFeature_RNA" "group" "nCount_SCT" "nFeature_SCT"
[7] "SCT_snn_res.0.5" "seurat_clusters" "SCT_snn_res.0.1" "SCT_snn_res.0.4" "SCT_snn_res.0.6" "SCT_snn_res.0.8"
[13] "SCT_snn_res.1" "SCT_snn_res.1.2" "SCT_snn_res.1.4" "SCT_snn_res.1.6" "SCT_snn_res.0.3" "SCT_snn_res.0.7"
[19] "SCT_snn_res.0.9" "SCT_snn_res.1.1" "SCT_snn_res.1.3" "SCT_snn_res.1.5"
Idents(scRNA) <- "SCT_snn_res.0.1" ##从Clustree的结果看,我直接选择resolution=0.1,但在写这篇简书的时候发现选择resolution=0.3可能会更好。
DimPlot(scRNA,label = TRUE) ##图没有贴出来,得到12个Clsuters
从Clustree的结果看,我当时直接选择resolution=0.1,但在写这篇简书的时候发现选择resolution=0.3可能会更好,但后面流程都是resolution=0.1,就这样吧,仅仅作为一个流程参考罢了。
寻找Marker基因并注释。参考
##按照自己分析的,寻找不同cluster中的marker基因。
#如果是单个条件的单个样本:使用以FindAllMarkers(),即将每个类群与所有其他类群进行比较,以确定潜在的marker genes
scRNA.markers= FindAllMarkers(scRNA,only.pos = TRUE,min.pct = 0.1,logfc.threshold=0.25,test.use = "wilcox")
top3 <- scRNA.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC) ##每个Cluster选择3个top genes
Doheatmap = DoHeatmap(subset(scRNA,downsample = 300), features =top3$gene) + NoLegend()
#如果多个不同条件的样本:使用FindConservedMarkers()寻找多个样本中的保守基因作为markers genes
DefaultAssay(scRNA) <- "RNA" ##这里我们使用原始计数而不是聚合数据。
cluster_marker_gene = mclapply(0:11, function(i){
cluster_i = FindConservedMarkers(scRNA, ident.1 = i, grouping.var = "group", verbose = FALSE) %>% as.data.frame()
cluster_i$ident = i
}, mc.cores = 12)
cluster_marker_gene_df = do.call(rbind, cluster_marker_gene)
head(cluster_marker_gene_df,3)
根据文章提供的marker gene作注释#1
##按照文章的marker基因查看每个cluster中基因的表达情况。
markers.to.plot <- c("TRAC", "MS4A1", "IGHG1", "PECAM1", "FCGR3B", "MS4A6A", "COL1A1", "TPSAB1", "KRT6A","LTF")
#Dot图
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8) +RotatedAxis()
#Feature图
FeaturePlot(scRNA, features = markers.to.plot, max.cutoff = 3,cols = c("grey", "red"),min.cutoff = 'q9')
#小提琴图
plot = VlnPlot(scRNA, features = markers.to.plot, pt.size = 0, combine = FALSE)
wrap_plots(plots = plot, ncol = 4)+plot_layout(guides = "collect")
可以看到使用文章的marker对于Cluster的注释还是不错的,不同的marker 在不同的Cluster表达情况区分明显。
根据文章提供的marker gene作注释#2
scRNA <- RenameIdents(scRNA,
`0` = "T cell",
`1` = "Endotheliad",
`2` = "B cell",
`3` = "Neutrophil",
`4` = "Plasma", `5` = "Monocytic",
`6` = "Plasma", `7` = "Epitheial",
`8` = "Plasma", `9` = "Mast cell",
`10` = "Fibroblasts",
`11` = "unknow")
DimPlot(scRNA, label = TRUE,split.by = "group")
DimPlot(scRNA, label=TRUE)
复现文章中Boxplot图,对每一个细胞Cluster统计不同处理组中细胞变化情况。
##画不同的cluster在不同组的分布(boxplot)
# group.add = c(rep("HC",4),rep("PD",5),rep("PDT",3))
my_comparisons <- list(c("HC", "PD"), c("HC", "PDT"), c("PD", "PDT"))
cell <- list()
plot3 = list()
for (i in 1:length(cellkind)){
cell_i = subset(scRNA,idents =cellkind[i])$orig.ident %>% table() %>% prop.table() %>% as.data.frame()
cell_i$group = gsub('.{1}$', '', cell_i$.)
plot3_i = ggplot(cell_i,aes(x=group,y=Freq,fill=group))+geom_boxplot()+theme_classic()+
labs(title = cellkind[i])+xlab(NULL)+
stat_compare_means(comparisons = my_comparisons,method = "t.test",label = "p.format",size = 2)
cell[[i]] <- list(cell_i)
plot3[[i]] = plot3_i
}
rm(plot3_i)
rm(cell_i)
wrap_plots(plots = plot3, ncol = 4)+plot_layout(guides = "collect") ##整理列表的输入和输出
可以看到和文章中总体趋势一致,但在P值和显著性水平上仍是和文章有部分出入。
细胞轨迹分析
##细胞轨迹分析:packages安装。
library(devtools)
install_github("velocyto-team/velocyto.R")
devtools::install_github('cole-trapnell-lab/monocle3')
报错:
Error: Failed to install 'unknown package' from GitHub:
An unknown option was passed in to libcurl
报错的原因应该是Curl包版本的问题。服务器上的Rstudio server链接在public下的R版本,一些R包安装起来就和系统的一些东西起冲突,如果是自己的环境下,一个conda直接解决,可惜root不在我。
-------记于2023/3/31 00:47