组织内细胞异质性的基础是细胞转录状态的差异,转录状态的特异性又是由转录因子主导的基因调控网络(GRNs)决定并维持稳定的。因此分析单细胞的GRNs有助于深入挖掘细胞异质性背后的生物学意义,并为疾病的诊断、治疗以及发育分化的研究提供有价值的线索。
然而单细胞转录组数据具有背景噪音高、基因检出率低和表达矩阵稀疏的特点,给传统统计学和生物信息学方法推断高质量的GRNs带来了挑战。Single-cell regulatory network inference and clustering (SCENIC)是一种专为单细胞数据开发的GRNs推断算法,它的创新之处在于引入了转录因子motif序列分析,用来验证由统计学方法推断的基因共表达网络,从而识别高可靠性的由转录因子主导的GRNs。SCENIC相关的文章2017年首先发表于Nature Methods,2020年又将流程整理后发表于Nature Protocols。
SCENIC分析流程
官方介绍的主要分析有四步:
- GENIE3/GRNBoost:基于共表达情况鉴定每个TF的潜在靶点;
- RcisTarget:基于DNA-motif 分析选择潜在的直接结合靶点;
- AUCell:分析每个细胞的regulons活性;
- 细胞聚类:基于regulons的活性鉴定稳定的细胞状态。
一、 运行 pyscenic
1. loom 文件制备
以pbmc3k为例,降维聚类,输出csv矩阵文件。
library(SeuratData) # provides the packaged pbmc3k dataset
library(Seurat)
# InstallData("pbmc3k") # one-time download of the pbmc3k dataset
data("pbmc3k")
sce <- pbmc3k.final
table(Idents(sce))
p1 <- DimPlot(sce, label = TRUE)
p1
# Export the raw counts transposed, i.e. one row per cell and one column per
# gene; the accessor is used instead of reaching into the object's slots.
counts_mat <- as.matrix(GetAssayData(sce, assay = "RNA", slot = "counts"))
write.csv(t(counts_mat), file = "pbmc_3k.all.csv")
这一步会生成一个70M的pbmc_3k.all.csv文件
接下来需要在Linux操作了。写一个 Python脚本 ( csv2loom.py )把 csv格式的表达量矩阵 转为 .loom 文件。
这一步是在linux下面操作
import os, sys
import loompy as lp
import numpy as np
import scanpy as sc

# Look up the working directory and its contents (results are only shown
# when run interactively).
os.getcwd()
os.listdir(os.getcwd())

# Read the CSV exported from R.
# (The original tutorial code read "pbmc_3k.csv" at this point.)
adata = sc.read_csv("pbmc_3k.all.csv")
gene_attrs = {"Gene": np.array(adata.var_names)}
cell_attrs = {"CellID": np.array(adata.obs_names)}
# Write the transposed matrix to the loom file, labelled by gene and cell.
lp.create("pbmc_3k.loom", adata.X.transpose(), gene_attrs, cell_attrs)
上面的脚本写了后,就可以 运行 Python脚本 ( csv2loom.py )把 csv格式的表达量矩阵 转为 .loom 文件:
# Optional: activate the conda environment named "pyscenic" first
# (presumably the one where the conversion dependencies are installed).
#conda activate pyscenic
# Run the conversion script; it reads pbmc_3k.all.csv and writes pbmc_3k.loom.
python csv2loom.py
这一步会生成一个6.7M的pbmc_3k.loom文件。
2. pyscenic运行
2.1 三大文件下载
但是在这之前需要提前下载好3个重要文件。
文件1:hs_hgnc_tfs.txt,https://github.com/aertslab/pySCENIC/blob/master/resources/hs_hgnc_tfs.txt
文件2: hg19-tss-centered-10kb-7species.mc9nr.feather,https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather
文件3: motifs-v9-nr.hgnc-m0.001-o0.0.tbl,https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
第1个文件12k,第2个文件1.02G,第三个文件99M,大小一定要正确。
2.2 run_pyscenic.sh脚本编写
# Abort on the first failing step or on any unset variable, so a failed step
# never feeds an incomplete file into the next one.
set -eu

# Reference databases are species-specific; the files below are for human.
# Base directory holding the reference files. Export `dir` beforehand to point
# elsewhere; it was previously never set, so every path resolved under "/".
dir=${dir:-$PWD}
# Worker processes per step. Lower it (e.g. `num_workers=4`) if the run dies
# with distributed.scheduler.KilledWorker.
num_workers=${num_workers:-20}

tfs=$dir/TF/TFs_list/hs_hgnc_tfs.txt
feather=$dir/hg19-tss-centered-10kb-7species.mc9nr.feather
tbl=$dir/TF/TFs_annotation_motif/human_TFs/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
input_loom=pbmc_3k.loom

# Make sure every input exists and is non-empty before starting the long run.
for f in "$tfs" "$feather" "$tbl" "$input_loom"; do
  if [ ! -s "$f" ]; then
    echo "Missing or empty input file: $f" >&2
    exit 1
  fi
done
# List the database files with their sizes so they can be checked by eye.
ls -lh "$tfs" "$feather" "$tbl"

# pyscenic step 1 of 3: grn
pyscenic grn \
--num_workers "$num_workers" \
--output adj.sample.tsv \
--method grnboost2 \
"$input_loom" \
"$tfs"

# pyscenic step 2 of 3: cistarget (ctx)
pyscenic ctx \
adj.sample.tsv "$feather" \
--annotations_fname "$tbl" \
--expression_mtx_fname "$input_loom" \
--mode "dask_multiprocessing" \
--output reg.csv \
--num_workers "$num_workers" \
--mask_dropouts

# pyscenic step 3 of 3: AUCell
pyscenic aucell \
"$input_loom" \
reg.csv \
--output out_SCENIC.loom \
--num_workers "$num_workers"
这一步会得到11M的out_SCENIC.loom文件。
最重要的四个文件如下
11M 3月 15 09:21 out_SCENIC.loom
6.7M 3月 13 20:59 pbmc_3k.loom
14M 3月 15 09:18 reg.csv
70M 3月 13 18:18 pbmc_3k.all.csv
二、初级可视化
1. 依赖包安装
## SCENIC depends on several packages; install them first.
# Install the installers themselves when missing, so the namespaced calls
# below also work on a fresh R library.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
BiocManager::install(c("AUCell", "RcisTarget"))
BiocManager::install(c("GENIE3"))
BiocManager::install(c("zoo", "mixtools", "rbokeh"))
BiocManager::install(c("DT", "NMF", "pheatmap", "R2HTML", "Rtsne"))
BiocManager::install(c("doMC", "doRNG"))
devtools::install_github("aertslab/SCopeLoomR", build_vignettes = TRUE)
devtools::install_github("aertslab/SCENIC")
library(SCENIC)
# Version used when these notes were written:
packageVersion("SCENIC")
#[1] ‘1.2.4’
2. 提取 out_SCENIC.loom 信息
## Visualization
# The workspace is deliberately not wiped with rm(list = ls()); start a fresh
# R session instead if a clean environment is wanted.
library(Seurat)
library(SCopeLoomR)
library(AUCell)
library(SCENIC)
library(dplyr)
library(KernSmooth)
library(RColorBrewer)
library(plotly)
library(BiocParallel)
library(grid)
library(ComplexHeatmap)
library(data.table)
library(scRNAseq)
## Extract the information stored in out_SCENIC.loom
# inputDir <- './outputs/'
# scenicLoomPath <- file.path(inputDir, 'out_SCENIC.loom')
loom <- open_loom("out_SCENIC.loom")
# Regulon membership read from the "Regulons" attribute, then converted to
# one gene list per regulon.
regulons_incidMat <- get_regulons(loom, column.attr.name = "Regulons")
regulons_incidMat[1:4, 1:4]
regulons <- regulonsToGeneLists(regulons_incidMat)
names(regulons)
# Per-cell regulon AUC values read from the "RegulonsAUC" attribute.
regulonAUC <- get_regulons_AUC(loom, column.attr.name = "RegulonsAUC")
rownames(regulonAUC)
regulonAucThresholds <- get_regulon_thresholds(loom)
# Show the last entries after ordering by the numeric value of the names.
tail(regulonAucThresholds[order(as.numeric(names(regulonAucThresholds)))])
embeddings <- get_embeddings(loom)
close_loom(loom)
3. 加载SeuratData
然后载入前面的seurat对象,我们这里仅仅是最基础的示例数据,所以直接使用 SeuratData 包即可
library(SeuratData) # provides the packaged pbmc3k dataset
library(ggplot2)
library(stringr)
data("pbmc3k")
sce <- pbmc3k.final
table(sce$seurat_clusters)
table(Idents(sce))
# Keep the annotated identities as a metadata column.
sce$celltype <- Idents(sce)
genes_to_check <- c(
  "PTPRC", "CD3D", "CD3E", "CD4", "CD8A",
  "CD19", "CD79A", "MS4A1",
  "IGHG1", "MZB1", "SDC1",
  "CD68", "CD163", "CD14",
  "TPSAB1", "TPSB2", # mast cells
  "RCVRN", "FPR1", "ITGAM",
  "C1QA", "C1QB", # mac
  "S100A9", "S100A8", "MMP19", # monocyte
  "FCGR3A",
  "LAMP3", "IDO1", "IDO2", ## DC3
  "CD1E", "CD1C", # DC2
  "KLRB1", "NCR1", # NK
  "FGF7", "MME", "ACTA2", ## fibo
  "DCN", "LUM", "GSN", ## mouse PDAC fibo
  "MKI67", "TOP2A",
  "PECAM1", "VWF", ## endo
  "EPCAM", "KRT19", "PROM1", "ALDH1A1"
)
genes_to_check <- str_to_upper(genes_to_check)
genes_to_check
# Marker dot plot; each plot is passed to ggsave() explicitly so the saved
# file never depends on which plot happened to be drawn last.
p <- DotPlot(sce, features = unique(genes_to_check), assay = "RNA") +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p
ggsave("check_last_markers.pdf", plot = p, height = 11, width = 11)
DimPlot(sce, reduction = "umap", label = TRUE)
sce$sub_celltype <- sce$celltype
p_celltype <- DimPlot(sce, reduction = "umap", label = TRUE,
                      group.by = "sub_celltype")
p_celltype
ggsave("umap-by-sub_celltype.pdf", plot = p_celltype)
Idents(sce) <- sce$sub_celltype
# Re-cluster at resolution 0.8 as a sanity check of the grouping.
# NOTE(review): FindClusters presumably resets the active identities and
# `seurat_clusters`; confirm before relying on either downstream.
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8)
table(sce@meta.data$RNA_snn_res.0.8)
p_clusters <- DimPlot(sce, reduction = "umap", label = TRUE)
p_clusters
ggsave("umap-by-sub_RNA_snn_res.0.8.pdf", plot = p_clusters)
这里的代码仍然是简单的检验一下自己的降维聚类分群是否合理,方便后续合并分析。
4. 四种可视化
现在我们就可以把pyscenic的转录因子分析结果跟我们的降维聚类分群结果结合起来,进行4种可视化展示(DotPlot、RidgePlot、VlnPlot、FeaturePlot)。
合并的代码如下所示:
# Reorder the regulon AUC columns to follow the cell order of the Seurat object.
cell_idx <- match(colnames(sce), colnames(regulonAUC))
# Stop if any Seurat cell has no AUC column; an NA index would otherwise
# break or corrupt the subsetting below.
stopifnot(!anyNA(cell_idx))
sub_regulonAUC <- regulonAUC[, cell_idx]
dim(sub_regulonAUC)
sce
# Enforce (rather than only print) that both objects list the same cells in
# the same order.
stopifnot(identical(colnames(sub_regulonAUC), colnames(sce)))
# Per-cell annotation tables, indexed by cell name.
cellClusters <- data.frame(
  row.names = colnames(sce),
  seurat_clusters = as.character(sce$seurat_clusters)
)
cellTypes <- data.frame(
  row.names = colnames(sce),
  celltype = sce$sub_celltype
)
head(cellTypes)
head(cellClusters)
sub_regulonAUC[1:4, 1:4]
# Persist everything the later RSS / visualization steps load back in.
save(sub_regulonAUC, cellTypes, cellClusters, sce,
     file = "for_rss_and_visual.Rdata")
这个时候,我们的pbmc3k数据集里面的两千多个细胞都有表达量矩阵,也有转录因子活性打分信息。
B细胞有两个非常出名的转录因子,TCF4(+) 以及NR2C1(+),接下来就可以对这两个进行简单的可视化。
首先,我们需要把这两个转录因子的活性信息添加到降维聚类分群后的seurat对象里面。
# TCF4(+) is not present in this dataset's results, so PAX5(+) and REL(+)
# are plotted instead.
regulonsToPlot <- c("PAX5(+)", "REL(+)")
regulonsToPlot
# Fail with a clear message when a requested regulon is absent.
missing_regulons <- setdiff(regulonsToPlot, rownames(sub_regulonAUC))
if (length(missing_regulons) > 0) {
  stop("Regulons not found in sub_regulonAUC: ",
       paste(missing_regulons, collapse = ", "), call. = FALSE)
}
# AUC values of the selected regulons (rows) across cells (columns), read with
# getAUC() like the rest of the analysis.
auc_selected <- getAUC(sub_regulonAUC)[regulonsToPlot, , drop = FALSE]
# Add one metadata column per regulon, matched by cell name rather than by
# position; re-running overwrites the columns instead of duplicating them.
cells <- rownames(sce@meta.data)
for (regulon in regulonsToPlot) {
  sce@meta.data[[regulon]] <- auc_selected[regulon, cells]
}
Idents(sce) <- sce$sub_celltype
table(Idents(sce))
DotPlot(sce, features = unique(regulonsToPlot)) + RotatedAxis()
RidgePlot(sce, features = regulonsToPlot, ncol = 1)
VlnPlot(sce, features = regulonsToPlot, pt.size = 0)
FeaturePlot(sce, features = regulonsToPlot)
可以看到,原教程中提到B细胞有两个非常出名的转录因子TCF4(+)以及NR2C1(+);本例中改用的PAX5(+)和REL(+)的活性也确实是在B细胞里比较独特地高。
可以看到效果其实没有前面的DotPlot好
三、各个单细胞亚群特异性激活转录因子及可视化
回顾一下pyscenic的转录因子分析结果
###### step0: load the R packages #####
# The workspace is deliberately not wiped with rm(list = ls()); start a fresh
# R session instead if a clean environment is wanted.
library(Seurat)
library(SCopeLoomR)
library(AUCell)
library(SCENIC)
library(dplyr)
library(KernSmooth)
library(RColorBrewer)
library(plotly)
library(BiocParallel)
library(grid)
library(ComplexHeatmap)
library(data.table)
library(scRNAseq)
# Restore sub_regulonAUC, cellTypes, cellClusters and sce saved earlier.
load("for_rss_and_visual.Rdata")
head(cellTypes)
sub_regulonAUC[1:4, 1:2]
dim(sub_regulonAUC)
#[1] 220 2638
值得一提的是这个pbmc3k数据集的两千多个细胞,其实就220个转录因子结果(曾老师教程里面是208个)。
1. TF活性均值
看看不同单细胞亚群的转录因子活性平均值
# Split the cells by cluster:
selectedResolution <- "celltype" # annotation column used for grouping
cellsPerGroup <- split(rownames(cellTypes),
                       cellTypes[, selectedResolution])
# Remove extended regulons
sub_regulonAUC <- sub_regulonAUC[onlyNonDuplicatedExtended(rownames(sub_regulonAUC)), ]
dim(sub_regulonAUC)
#[1] 220 2638 # no visible difference for this dataset
# Calculate the average activity per group.
# The AUC matrix is extracted once, and drop = FALSE keeps a group that
# contains a single cell as a one-column matrix so rowMeans() still works.
auc_mat <- getAUC(sub_regulonAUC)
regulonActivity_byGroup <- vapply(
  cellsPerGroup,
  function(cells) rowMeans(auc_mat[, cells, drop = FALSE]),
  FUN.VALUE = numeric(nrow(auc_mat))
)
# Scale the activity.
# scale() standardizes columns, so the matrix is transposed first (groups as
# rows, regulons as columns) and transposed back afterwards.
# Reference: https://www.jianshu.com/p/115d07af3029
regulonActivity_byGroup_Scaled <- t(scale(t(regulonActivity_byGroup),
                                          center = TRUE, scale = TRUE))
# i.e. each regulon is scaled across the clusters
dim(regulonActivity_byGroup_Scaled)
#[1] 220 9
# Drop regulons whose scaled values contain NA.
regulonActivity_byGroup_Scaled <- na.omit(regulonActivity_byGroup_Scaled)
2. 热图查看TF分布
# Heatmap of the scaled per-group regulon activity matrix.
pheatmap(regulonActivity_byGroup_Scaled)
可以看到,确实每个单细胞亚群都是有 自己的特异性的激活的转录因子。
3. rss 查看特异TF
不过,SCENIC包自己提供了一个 calcRSS函数,帮助我们来挑选各个单细胞亚群特异性的转录因子,全称是:Calculates the regulon specificity score
参考文章:The RSS was first used by Suo et al. in: Revealing the Critical Regulators of Cell Identity in the Mouse Cell Atlas. Cell Reports (2018). doi: 10.1016/j.celrep.2018.10.045
运行超级简单。
# Regulon specificity score (RSS) of each regulon for each cell type.
auc_values <- getAUC(sub_regulonAUC)
# Cell-type label of every cell, in the column order of the AUC matrix.
cell_labels <- cellTypes[colnames(sub_regulonAUC), selectedResolution]
rss <- calcRSS(AUC = auc_values, cellAnnotation = cell_labels)
# Drop rows containing NA before plotting.
rss <- na.omit(rss)
rssPlot <- plotRSS(rss)
# Interactive version of the RSS plot.
plotly::ggplotly(rssPlot$plot)
PAX5(+)和REL(+)的确在B细胞里面高表达。
4. 其他查看TF方式
library(dplyr)
# NOTE: from here on `rss` holds the scaled per-group activity matrix, not the
# scores returned by calcRSS().
rss <- regulonActivity_byGroup_Scaled
head(rss)
# For every cluster, compare each regulon's value in that cluster (sd.1)
# with its median over all other clusters (sd.2).
df <- do.call(
  rbind,
  lapply(seq_len(ncol(rss)), function(i) {
    data.frame(
      path = rownames(rss),
      cluster = colnames(rss)[i],
      sd.1 = rss[, i],
      # drop = FALSE keeps a matrix even when only one other cluster remains
      sd.2 = apply(rss[, -i, drop = FALSE], 1, median)
    )
  })
)
df$fc <- df$sd.1 - df$sd.2
# Five regulons with the largest difference per cluster (ties are kept).
top5 <- df %>%
  group_by(cluster) %>%
  slice_max(fc, n = 5) %>%
  ungroup()
rowcn <- data.frame(path = top5$cluster)
n <- rss[top5$path, ]
# Left disabled as in the original; presumably because row names of `n` can
# repeat when one regulon is selected for several clusters.
# rownames(rowcn) <- rownames(n)
pheatmap(n,
         annotation_row = rowcn,
         show_rownames = TRUE)
也可以按照sd计算df的sd.2
或者按照mean计算df的sd.2
在这里似乎median、sd、mean都差不多。
至此,pySCENIC的转录因子分析及数据可视化教程复现结束。
报错:
2023-04-04 00:28:00,057 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
2023-04-04 00:28:00,076 - distributed.nanny - WARNING - Worker process still alive after 3.9999994277954105 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.9999990463256836 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
2023-04-04 00:28:00,077 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
2023-04-04 00:28:00,081 - distributed.nanny - WARNING - Worker process still alive after 3.9999988555908206 seconds, killing
2023-04-04 00:28:00,092 - distributed.nanny - WARNING - Worker process still alive after 3.999999237060547 seconds, killing
Traceback (most recent call last):
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/bin/pyscenic", line 8, in <module>
sys.exit(main())
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 713, in main
args.func(args)
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 106, in find_adjacencies_command
network = method(
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 39, in grnboost2
return diy(expression_data=expression_data, regressor_type='GBM', regressor_kwargs=SGBM_KWARGS,
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/arboreto/algo.py", line 134, in diy
return client \
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 3215, in compute
result = self.gather(futures)
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2174, in gather
return self.sync(
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 338, in sync
return sync(
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 405, in sync
raise exc.with_traceback(tb)
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/utils.py", line 378, in f
result = yield future
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/tornado/gen.py", line 762, in run
value = future.result()
File "/Bioinfo/nas1/software/conda/Anaconda3/2023.03/lib/python3.10/site-packages/distributed/client.py", line 2037, in _gather
raise exception.with_traceback(traceback)
distributed.scheduler.KilledWorker: ('ndarray-99aebd9077176e83098d0511c36efe41', <WorkerState 'tcp://127.0.0.1:34970', name: 30, status: closed, memory: 0, processing: 23711>)
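解决办法:把 pyscenic 的 --num_workers 参数设置小一点(例如从 20 调到 4),
减少同时启动的 worker 数量;这类 KilledWorker 报错通常是内存不足导致 worker 被杀掉。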
参考:
https://www.jianshu.com/p/0a29ecfaf21e
https://www.jianshu.com/p/05d1b2d0d772
https://www.jianshu.com/p/7ab2d6c8f764
https://mp.weixin.qq.com/s/SIfyGzx4fwXPtQsVvvwwMQ
https://mp.weixin.qq.com/s/py4AWdtaNNMPqLzU4loODQ
https://mp.weixin.qq.com/s/yaYSqqvBnK8OlL0ZQkR94Q
可视化举例
做热图主要分为两种,一种是把细胞分组后求regulons的活性平均值/中位数,另一种是直接展示所有细胞regulons的活性值。
-
1、细胞分组求regulons的活性平均值/中位数
-
2、所有细胞的活性值
-
3、基于Seurat的气泡图
-
4、基于Seurat的峰图
5、基于Seurat的小提琴图
6、基于Seurat的点图
7、regulon网络调控图
8、RSS得分热图和气泡图
https://mp.weixin.qq.com/s/zaXpaTQ0IwYGgMO3XIGVaQ
https://www.jianshu.com/p/7180828033a7
https://www.jianshu.com/p/1c9937e7996c
https://blog.csdn.net/qq_42090739/article/details/127745764
https://zhuanlan.zhihu.com/p/537069815
https://www.jianshu.com/p/c47332b880aa
https://mp.weixin.qq.com/s/pN4qWdUszuGqr8nOJstn8w
https://mp.weixin.qq.com/s/5y-72Hfy3vaxgLiT9kZ4_Q
https://mp.weixin.qq.com/s/sA5F1y4Awki9hiHl2T1DrA