seurat单细胞数据处理小技巧

1 亚群合并

当有几类亚群同属于某类细胞时，比如CD4+ T细胞和CD8+ T细胞均属于T细胞，想要将他们合并在一起时，可以使用此代码。

#接seurat标准流程代码
#命名pbmc为sce，表明其是seurat对象
sce=pbmc

Idents(sce)#细胞身份，属于哪个亚群
levels(sce)#获取sce的亚群名
> [1] "Naive CD4 T"  "CD14+ Mono"   "Memory > CD4 T"
> [4] "B"            "CD8 T"        "FCGR3A+ > Mono"
> [7] "NK"           "DC"           "Platelet"
#可以看到Naive CD4 T，Memory CD4T及CD8 T都属于T细胞

head(sce@meta.data) #可以看到meta.data中多了seurat_clusters列，由0-9构成。
# 方法1 

#根据levels(sce)可将要合并的亚群写作如下
new.cluster.ids <- c("T", "Mono", "T", 
                     "B", "T", "Mono",
                     "T", "DC", "Platelet")

names(new.cluster.ids) <- levels(sce)
#更改前
p1<-DimPlot(pbmc, reduction = 'umap', 
            label = TRUE, pt.size = 0.5) + NoLegend()

#重命名亚群
sce <- RenameIdents(sce, new.cluster.ids)
#更改后
p2<-DimPlot(sce, reduction = 'umap', 
        label = TRUE, pt.size = 0.5) + NoLegend()


#方法2
根据亚群对应关系，创建一个向量
cluster2celltype <- c("0"="T",
                  "1"="Mono", 
                  "2"="T", 
                  "3"= "B", 
                  "4"= "T", 
                  "5"= "Mono",
                  "6"= "T", 
                  "7"= "DC", 
                  "8"= "Platelet") 
#unname去掉对象的名（行名列名）
sce[['cell_type']] = unname(cluster2celltype[sce@meta.data$seurat_clusters])
#绘制散点图，reduction：使用哪种降维。如果没有指定，首先搜索 umap，然后是 tsne，然后是 pca；label:是否标记分群
DimPlot(sce, reduction = 'umap', group.by = 'cell_type',
        label = TRUE, pt.size = 0.5) + NoLegend()#删除图例

p3<-DimPlot(sce, reduction = 'umap', group.by = 'cell_type',
        label = TRUE, pt.size = 0.5) + NoLegend()#删除图例
p1+p2+p3#此函数需要library(dplyr)

推荐使用第二种方法，这样不更改原始亚群分群，只是在metadata中增加了一列

2 提取子集

当我们想把表达感兴趣基因的细胞提取出来单独分析时，可使用此函数。

p1<-DimPlot(pbmc, reduction = 'umap', group.by ="seurat_clusters",label = TRUE, pt.size = 0.5) + NoLegend()
p2<-FeaturePlot(pbmc, features = c("CD4"))
p3<-DotPlot(sce, group.by = 'seurat_clusters',
           features = unique(genes_to_check)) + RotatedAxis()

p1+p2+p3

比如我们想把某几个亚群提取出来

sce=pbmc
#R 中，提取子集, 需要知道逻辑值，坐标及名字
# seurat 取0，2分群的子集
cd4_sce1 = sce[,sce@meta.data$seurat_clusters %in% c(0,2)]
#取Naive CD4 T ,  Memory CD4 T 
cd4_sce2 = sce[, Idents(sce) %in% c( "Naive CD4 T" ,  "Memory CD4 T" )]

取子集后就是新的项目，需要重新标准化，重新用子集走一遍seurat标准流程

3 如何分群是合理的

我们知道FindClusters函数中不同的resolution参数会带来不同的结果，而且即使某个亚群内部的细胞也会有一定的异质性，那么到底分群多少是合适的呢？

#先执行不同resolution 下的分群
library(Seurat)
library(clustree)
sce <- FindClusters(
  object = sce,
  resolution = c(seq(from=.1,to=1.6,by=.2))
) 
#clustree创建一个聚类树图，依赖于clustree包，显示不同分辨率的聚类之间的关系。prefix指示包含聚类信息的列的字符串
colnames(sce@meta.data)
pz<-clustree(sce@meta.data, prefix = "RNA_snn_res.")

p2=DimPlot(sce, reduction = 'umap', group.by = 'RNA_snn_res.0.5',
           label = TRUE, pt.size = 0.5) + NoLegend()
p3=DimPlot(sce, reduction = 'umap',group.by = 'RNA_snn_res.1.5',
           label = TRUE, pt.size = 0.5) + NoLegend()
library(patchwork)
p1+p2

根据clustree可确定合适的分群数，若resolution设置过大，会导致分群过度，把原来分群的中应有的异质性也提炼出来单独作为一群。无论如何分群，都应该结合FindMarkers函数，确认亚群标志基因，结合生物学背景来解释分群结果。

4 细胞亚群等比例抽样

当数据特别大时，可以采用等比例抽样的方式，降低运算时间。

sce=pbmc

p1<-DoHeatmap(sce , 
          features = features, 
          size = 3
          )
table(Idents(sce))
#使用subset函数，每个亚群抽取15个细胞，做热图
p2<-DoHeatmap(subset(sce, downsample = 15), 
          features = features, size = 3)+
  labs(title = paste("downsample = 15"))
p1+p2

# 每个细胞亚群抽10 
allCells=names(Idents(sce))
allType = levels(Idents(sce))

choose_Cells = unlist(lapply(allType, function(x){
  cgCells = allCells[Idents(sce)== x ]
  cg=sample(cgCells,10) #sample 从指定的元素中获取指定大小的样本。
  cg
}))
#将抽取出的细胞组成一个新的sce对象
cg_sce = sce[, allCells %in% choose_Cells]
> An object of class Seurat 
> 13714 features across 90 samples within 1 > assay 
> Active assay: RNA (13714 features, 2000 > > variable features)
>  2 dimensional reductions calculated: pca, umap

as.data.frame(table(Idents(cg_sce)))

5 如何查看基因的原始表达量

# 如何看原始表达量 slot:要使用的数据槽，从“raw.data”、“data”或“scale.data”中选择;size 颜色条上方文字大小
#不加slot默认是从之前2000个FindVariableFeatures基因作图
DoHeatmap(subset(sce ), 
          features = features, 
          size = 3, slot='data'
)

6 多个亚群的多个标记基因可视化

#首先需要将各个亚群的标记基因找出来
sce.markers <- FindAllMarkers(object = sce, 
               only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)

#此时可以创建一个HTML小部件来显示矩形数据
DT::datatable(sce.markers)

可以以交互形式的HTML格式查看各亚群的标记基因

#挑选各个亚群差异基因的前5名
top5 <- sce.markers %>% group_by(cluster) %>% top_n(5,avg_log2FC)

head(top5)
#检测并去除重复值 ！表示“与、或、非”中的“非”的意思
top5=top5[!duplicated(top5$gene),] 
select_genes_all=split(top5$gene,top5$cluster)#将表拆分成列表
select_genes_all
DotPlot(object = sce, features=select_genes_all, 
        assay = "RNA") + theme(axis.text.x = element_text(angle= 45,  vjust = 0.5, hjust=0.5))

不同亚群多个差异基因的可视化

7 对任意细胞亚群做差异分析

#若没有运行过FindAllMarkers，运行FindAllMarkers函数，查找各亚群标记基因
sce.markers <- FindAllMarkers(object = sce, 
               only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)

#table使用交叉分类的因素建立一个列联表,计数在每个组合的因素水平。
table(sce.markers$cluster)

Naive CD4 T   CD14+ Mono Memory CD4 T            B        CD8 T 
         164          391          178          148          144 
FCGR3A+ Mono           NK           DC     Platelet 
         608          369          633          242

挑选亚群和对应基因

# 挑选细胞，若挑选多个亚群细胞，可采用正则表达式选择，比如挑选 Mono和T细胞：grepl(("Mono|T") , Idents(sce ))，这里只挑选Mono类亚群
kp=grepl(("Mono") , Idents(sce ))   
table(kp)
sce=sce[,kp]
sce
table( Idents(sce ))

# 挑选对应的标记基因
kp=grepl('Mono',sce.markers$cluster)#返回匹配与否的逻辑值
table(kp)
cg_sce.markers = sce.markers [ kp ,]
#这里认为vg_log2FC>2的才是标记基因
cg_sce.markers=cg_sce.markers[cg_sce.markers$avg_log2FC>2,]

查找两群的差异基因

markers_df <- FindMarkers(object = sce, 
                 ident.1 = 'FCGR3A+ Mono',#ident 聚类名
                ident.2 = 'CD14+ Mono',
                 #logfc.threshold = 0,
                  min.pct = 0.25)
                  
head(markers_df)

筛选差异基因

#选取avg_log2FC绝对值大于1基因
cg_markers_df=markers_df[abs(markers_df$avg_log2FC) >1,]

dim(cg_markers_df)

DoHeatmap(subset(sce, downsample = 50),
          slot = 'counts',
          unique(rownames(cg_markers_df)),size=3) 
#取两个基因集的交集（基因在某个群中即高表达，又有差异）
intersect(rownames(cg_markers_df) ,
          cg_sce.markers$gene)

8 人为划分亚群挑选差异基因

当我们对表达某类基因的细胞感兴趣时，可以把表达这类基因的的细胞挑选出来，单独划分为一个群。

highCells= colnames(subset(x = sce, subset = FCGR3A > 1,slot = 'counts')) 

#a %in% table，a值是否包含于table中，为真输出TURE，否者输出FALSE
highORlow=ifelse(colnames(sce) %in% highCells,'high','low')

table(highORlow)
table(Idents(sce))
table(Idents(sce),highORlow)

#将highORlow添加到meta.data中
sce@meta.data$highORlow<-highORlow

#查找划分出的亚群与其他亚群的差异基因
markers <- FindMarkers(sce, ident.1 = "high", group.by = 'highORlow' )
head(markers)

cg_markers=markers[abs(markers$avg_log2FC) >1,]

dim(cg_markers)

DoHeatmap(subset(sce, downsample = 15),
          rownames(cg_markers) ,size=3 )

9 查看任意文献的单细胞亚群标记基因

可以直接使用DotPlot函数以文献中标记基因作图

#若想查看下述marker_genes在T细胞中的表达，首先需要将T细胞相关亚群提取出来

sce=pbmc
table(Idents(sce))
#提取单细胞相关亚群
t_sce = sce[, Idents(sce) %in% c("Naive CD4 T" ,  "Memory CD4 T" , "CD8 T","NK")]  

# 然后进行标准的降维聚类分群,代码不要变动
sce=t_sce
sce <- NormalizeData(sce, normalization.method = "LogNormalize",
                     scale.factor = 1e4) 
sce <- FindVariableFeatures(sce, selection.method = 'vst',
                            nfeatures = 2000)
sce <- ScaleData(sce, vars.to.regress = "percent.mt")
sce <- RunPCA(sce, features = VariableFeatures(object = sce)) 
sce <- FindNeighbors(sce, dims = 1:10)
sce <- FindClusters(sce, resolution = 1 )
head(Idents(sce), 5)
table(sce$seurat_clusters) 
sce <- RunUMAP(sce, dims = 1:10)
DimPlot(sce, reduction = 'umap')

查看文献中提到的基因

marker_genes=c("LEF1","TCF7","SELL","IL7R","CD40LG","ANXA1","FOS","JUN","FOXP3","SAT1","IL2RA","CTLA4","PDCD1","CXCL13","CD200","TNFRSF18","CCR7","NELL2","CD55","KLF2","TOB1","ZNF683","CCL5","GZMK","EOMES","ITM2C","CX3CR1","GNLY","GZMH","GZMB","LAG3","CCL4L2","FCGR3A","FGFBP2","TYROBP","AREG","XCL1","KLRC1","TRDV2","TRGV9","MTRNR2L8","KLRD1","TRDV1","KLRC3","CTSW","CD7","MKI67","STMN1","TUBA1B","HIST1H4C" )

DotPlot(sce, features = marker_genes,
             assay='RNA'  )  + coord_flip() +
  theme(axis.text.x = element_text(angle = 90))

10 多个单细胞亚群各自标记基因联合进行GO及kegg数据库注释

#加载需要的包
library(Seurat)
library(gplots)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)

#加载上述cg_sce.markers
pro='all'
load('sce.markers.all_10_celltype.Rdata')
table(sce.markers$cluster)

#基因ID转换bitr(geneID, fromType, toType, OrgDb, drop = TRUE)
ids=bitr(sce.markers$gene,'SYMBOL','ENTREZID','org.Hs.eg.db')

#合并数据
sce.markers=merge(sce.markers,ids,by.x='gene',by.y='SYMBOL')

#拆分id和分群，将ENTREZID拆分成cluster定义的组 
gcSample=split(sce.markers$ENTREZID, sce.markers$cluster)

#比较亚群功能
xx <- compareCluster(gcSample, fun="enrichKEGG",organism="hsa", pvalueCutoff=0.05)
p=dotplot(xx) 
p+ theme(axis.text.x = element_text(angle = 45,  vjust = 0.5, hjust=0.5))
p

对相关基因进行富集分析

#将上升下降的差异基因挑选出来
deg=FindMarkers(object = sce, ident.1 = 'FCGR3A+ Mono',ident.2 = 'CD14+ Mono', #logfc.threshold = 0,min.pct = 0.25)

gene_up=rownames(deg[deg$avg_log2FC>0,])
gene_down=rownames(deg[deg$avg_log2FC < 0,])

library(org.Hs.eg.db)
#把SYMBOL改为ENTREZID，这里会损失一部分无法匹配到的
gene_up=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db, keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))

gene_down=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))

library(clusterProfiler)

这里用到了生信技能树-健明老师的一键分析代码，感谢健明老师的无私分享

## KEGG pathway analysis
### 做KEGG数据集超几何分布检验分析，重点在结果的可视化及生物学意义的理解。
run_kegg <- function(gene_up,gene_down,pro='test'){
  library(ggplot2)
  gene_up=unique(gene_up)
  gene_down=unique(gene_down)
  gene_diff=unique(c(gene_up,gene_down))
  ###   over-representation test
  # 下面把3个基因集分开做超几何分布检验
  # 首先是上调基因集。
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'hsa',
                      #universe     = gene_all,
                      pvalueCutoff = 0.9,
                      qvalueCutoff =0.9)
  head(kk.up)[,1:6]
  kk=kk.up
  dotplot(kk)
  kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
  write.csv(kk@result,paste0(pro,'_kk.up.csv'))
  
  # 首先是下调基因集。
  kk.down <- enrichKEGG(gene         =  gene_down,
                        organism     = 'hsa',
                        #universe     = gene_all,
                        pvalueCutoff = 0.9,
                        qvalueCutoff =0.9)
  head(kk.down)[,1:6]
  kk=kk.down
  dotplot(kk)
  kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
  write.csv(kk@result,paste0(pro,'_kk.down.csv'))
  
  # 最后是上下调合并后的基因集。
  kk.diff <- enrichKEGG(gene         = gene_diff,
                        organism     = 'hsa',
                        pvalueCutoff = 0.05)
  head(kk.diff)[,1:6]
  kk=kk.diff
  dotplot(kk)
  kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
  write.csv(kk@result,paste0(pro,'_kk.diff.csv'))
  
  
  kegg_diff_dt <- as.data.frame(kk.diff)
  kegg_down_dt <- as.data.frame(kk.down)
  kegg_up_dt <- as.data.frame(kk.up)
  down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
  up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
  #画图设置, 这个图很丑，大家可以自行修改。
  g_kegg=kegg_plot(up_kegg,down_kegg)
  print(g_kegg)
  
  ggsave(g_kegg,filename = paste0(pro,'_kegg_up_down.png') )
  
  if(F){
    ###  GSEA 
    ## GSEA算法跟上面的使用差异基因集做超几何分布检验不一样。
    kk_gse <- gseKEGG(geneList     = geneList,
                      organism     = 'hsa',
                      nPerm        = 1000,
                      minGSSize    = 20,
                      pvalueCutoff = 0.9,
                      verbose      = FALSE)
    head(kk_gse)[,1:6]
    gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
    gseaplot(kk_gse, 'hsa04110',title = 'Cell cycle') 
    kk=DOSE::setReadable(kk_gse, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
    tmp=kk@result
    write.csv(kk@result,paste0(pro,'_kegg.gsea.csv'))
    
    
    # 这里找不到显著下调的通路，可以选择调整阈值，或者其它。
    down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
    up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
    
    g_kegg=kegg_plot(up_kegg,down_kegg)
    print(g_kegg)
    ggsave(g_kegg,filename = paste0(pro,'_kegg_gsea.png'))
    # 
  }
  
}

### GO database analysis 
### 做GO数据集超几何分布检验分析，重点在结果的可视化及生物学意义的理解。
run_go <- function(gene_up,gene_down,pro='test'){
  gene_up=unique(gene_up)
  gene_down=unique(gene_down)
  gene_diff=unique(c(gene_up,gene_down))
  g_list=list(gene_up=gene_up,
              gene_down=gene_down,
              gene_diff=gene_diff)
  # 因为go数据库非常多基因集，所以运行速度会很慢。
  if(T){
    go_enrich_results <- lapply( g_list , function(gene) {
      lapply( c('BP','MF','CC') , function(ont) {
        cat(paste('Now process ',ont ))
        ego <- enrichGO(gene          = gene,
                        #universe      = gene_all,
                        OrgDb         = org.Hs.eg.db,
                        ont           = ont ,
                        pAdjustMethod = "BH",
                        pvalueCutoff  = 0.99,
                        qvalueCutoff  = 0.99,
                        readable      = TRUE)
        
        print( head(ego) )
        return(ego)
      })
    })
    save(go_enrich_results,file =paste0(pro, '_go_enrich_results.Rdata'))
    
  }
  # 只有第一次需要运行，就保存结果哈，下次需要探索结果，就载入即可。
  load(file=paste0(pro, '_go_enrich_results.Rdata'))
  
  n1= c('gene_up','gene_down','gene_diff')
  n2= c('BP','MF','CC') 
  for (i in 1:3){
    for (j in 1:3){
      fn=paste0(pro, '_dotplot_',n1[i],'_',n2[j],'.png')
      cat(paste0(fn,'\n'))
      png(fn,res=150,width = 1080)
      print( dotplot(go_enrich_results[[i]][[j]] ))
      dev.off()
    }
  }
  
  
}

kegg_plot <- function(up_kegg,down_kegg){
  dat=rbind(up_kegg,down_kegg)
  colnames(dat)
  dat$pvalue = -log10(dat$pvalue)
  dat$pvalue=dat$pvalue*dat$group 
  
  dat=dat[order(dat$pvalue,decreasing = F),]
  
  g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
    geom_bar(stat="identity") + 
    scale_fill_gradient(low="blue",high="red",guide = FALSE) + 
    scale_x_discrete(name ="Pathway names") +
    scale_y_continuous(name ="log10P-value") +
    coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
    ggtitle("Pathway Enrichment") 
}

运行以上代码后，就可以使用run_kegg，run_go和kegg_plot函数了

#直接利用脚本中的代码对正、负相关基因进行kegg富集分析
run_kegg(gene_up,gene_down,pro='test')

# 下面的 run_go 会比较慢，根据需要
# run_go(gene_up,gene_down,pro='shRNA')

#对正相关基因进行富集分析
go <- enrichGO(gene_up, OrgDb = "org.Hs.eg.db", ont="all") 
library(ggplot2)
library(stringr)
p1<-barplot(go, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")
go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv( go@result,file = 'gene_up_GO_all_barplot.csv')

#对负相关基因进行富集分析
go <- enrichGO(gene_down, OrgDb = "org.Hs.eg.db", ont="all") 
p2<-barplot(go, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free") 
 
go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
write.csv( go@result,file = 'gene_down_GO_all_barplot.csv')

p1+p2

参考来源
公众号：生信技能树
#section 3已更新#「生信技能树」单细胞公开课2021_哔哩哔哩_bilibili

致谢
I thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.

THE END