- Single-nucleus transcriptome analysis reveals dysregulation of angiogenic endothelial cells and neuroprotective glia in Alzheimer's disease
- PMID: 32989152 | 2020 Oct 13
- IF : 11.202 | Proc Natl Acad Sci USA(PNAS)
-
文献思路:对AD和Normal来源的前额皮质组织进行snRNA-seq;进行细胞类型注释,然后基于细胞类型的水平,对AD和Normal进行差异分析。
1-3
2-3
3-3
- 图表复现思路
(1)原始数据预处理,读为Seurat对象
(2)质控、过滤
(3)标准化、高变基因、归一化、降维(PCA、UMAP)、分群
(4)细胞类型注释
(5)基于细胞类型的AD/NC差异分析
(6)以Astro为例的细胞再聚类深入分析
提示:因为本次数据集涉及17W~细胞,所以使用的是服务器Rstudio,本地电脑比较困难。
1、读入数据
1.1 下载数据(GSE157827)--上传到服务器--批量改名(Read10X()
)--创建Seurat对象
- 也可以在服务器中,直接使用
wget
命令下载:https://ftp.ncbi.nlm.nih.gov/geo/series/GSE157nnn/GSE157827/suppl/
setwd("~/scAD")
fs=list.files('./GSE157827_RAW/','^GSM')
fs
library(tidyverse)
samples=str_split(fs,'_',simplify = T)[,1]
#批量将文件名改为 Read10X()函数能够识别的名字
lapply(unique(samples),function(x){
# x = unique(samples)[1]
y=fs[grepl(x,fs)]
folder=paste0("GSE157827_RAW/", paste(str_split(y[1],'_',simplify = T)[,1:2], collapse = "_"))
dir.create(folder,recursive = T)
#为每个样本创建子文件夹
file.rename(paste0("GSE157827_RAW/",y[1]),file.path(folder,"barcodes.tsv.gz"))
#重命名文件,并移动到相应的子文件夹里
file.rename(paste0("GSE157827_RAW/",y[2]),file.path(folder,"features.tsv.gz"))
file.rename(paste0("GSE157827_RAW/",y[3]),file.path(folder,"matrix.mtx.gz"))
})
library(Seurat)
samples=list.files("GSE157827_RAW/")
samples
dir <- file.path('./GSE157827_RAW',samples)
names(dir) <- samples
#创建Seurat对象
counts <- Read10X(data.dir = dir)
scRNA = CreateSeuratObject(counts)
dim(scRNA) #查看基因数和细胞总数
#[1] 33538 179392
table(scRNA@meta.data$orig.ident) #查看每个样本的细胞数
head(scRNA@meta.data)
1.2 metadata细节整理
scRNA$orig.ident = stringr::str_split(rownames(scRNA@meta.data), "_",simplify = T)[,2]
# AD、NC
scRNA$group = substr(scRNA$orig.ident,1,2)
head(scRNA@meta.data)
table(scRNA$group)
table(scRNA$orig.ident)
#为了绘图直观,factor因子化,并且设置指定的levels排序
ranks = order(as.numeric(substr(unique(scRNA$orig.ident),3,4)))
scRNA$orig.ident = factor(scRNA$orig.ident, levels = unique(scRNA$orig.ident)[ranks])
table(scRNA$orig.ident, scRNA$group)
2、质控、过滤低质量的细胞、基因
2.1 过滤前的指标可视化
#指标1:nFeature_RNA 表达基因数
feats <- c("nFeature_RNA", "nCount_RNA")
library(patchwork)
p_filt_b_1=VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 2,
group.by = "group") + NoLegend()
p_filt_b_2=VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 1,
group.by = "orig.ident") + NoLegend()
#指标2~3:线粒体、核糖体基因含量
mito_genes=rownames(scRNA)[grep("^MT-", rownames(scRNA))] ;mito_genes
scRNA=PercentageFeatureSet(scRNA, "^MT-", col.name = "percent_mito")
fivenum(scRNA@meta.data$percent_mito)
ribo_genes=rownames(scRNA)[grep("^Rp[sl]", rownames(scRNA),ignore.case = T)];ribo_genes
scRNA=PercentageFeatureSet(scRNA, "^RP[SL]", col.name = "percent_ribo")
fivenum(scRNA@meta.data$percent_ribo)
feats <- c("percent_mito","percent_ribo")
p_filt_b_3 =VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 2,
group.by = "group") + NoLegend()
p_filt_b_4=VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 1,
group.by = "orig.ident") + NoLegend()
(p_filt_b_1 / p_filt_b_2) | (p_filt_b_3 / p_filt_b_4)
2.2 确定过滤细胞、基因的阈值
- 细胞的相关阈值
#细胞-表达基因数-筛掉过高、过低
retained_c_umi_low <- scRNA$nFeature_RNA > 300
retained_c_umi_high <- scRNA$nFeature_RNA < 8000
#细胞-线粒体/核糖体含量-筛掉过低
retained_c_mito <- scRNA$percent_mito < 14
retained_c_ribo <- scRNA$percent_ribo < 3
table(retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high)
# FALSE TRUE
# 9876 169516
table(scRNA$group[retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high])
# AD NC
# 90480 79036
#指标可视化
#devtools::install_github("gaospecial/ggVennDiagram")
veen_eles = list(`Filt_low_umi\n(300)` = umi_low,
`Filt_high_umi\n(8000)` = umi_high,
`Filt_high_mito\n(14%)` = mito,
`Filt_high_ribo\n(3%)` = ribo)
library("ggVennDiagram")
ggVennDiagram(veen_eles, set_size = 4) +
ggplot2::scale_fill_gradient(low="#ffffb2",high = "#f03b20")
值得注意的是:snRNA-seq与scRNA-seq分析过程中最主要的不同便在于线粒体与核糖体比例。因为前者测的仅为核基因数据,所以理论上很少有核糖体、线粒体相关基因,因此需要过滤掉高表达核糖体、线粒体相关基因。
- 基因的相关阈值
feature_rowsum = Matrix::rowSums(scRNA@assays$RNA@counts>0)
head(feature_rowsum)
retained_f_low <- feature_rowsum > ncol(scRNA)*0.005
table(retained_f_low)
# FALSE TRUE
# 16262 17276
rankplot = data.frame(feature_count = sort(feature_rowsum),
gene = names(sort(retained_f_low)),
Rank = 1:length(feature_rowsum))
library(ggbreak)
ggplot(rankplot, aes(x=Rank, y=feature_count)) +
geom_point() + scale_y_break(c(10000,100000)) +
geom_hline(yintercept=ncol(scRNA)*0.005, color="red") +
geom_text(x=10000, y=4000, label="Feature cutoff : ncol(scRNA)*0.5%", size = 5)
- 根据上面的阈值进行过滤
scRNA_filt = scRNA[retained_f_low, retained_c_mito & retained_c_ribo &
retained_c_umi_low & retained_c_umi_high]
dim(scRNA_filt)
#[1] 17276 169516
table(scRNA_filt$group)
# AD NC
# 90480 79036
2.3 过滤后的指标可视化
feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_a_1=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 2,
group.by = "group") + NoLegend()
p_filt_a_2=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 1,
group.by = "orig.ident") + NoLegend()
feats <- c("percent_mito","percent_ribo")
p_filt_a_3=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 2,
group.by = "group") + NoLegend()
p_filt_a_4=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 1,
group.by = "orig.ident") + NoLegend()
(p_filt_a_1 / p_filt_a_2) | (p_filt_a_3 / p_filt_a_4)
2.4 保存过滤后的数据(optional)
# saveRDS(scRNA_filt, file = "sce_filt.rds")
# scRNA = readRDS("sce_filt.rds")
3、标准化、高变基因、归一化、降维(PCA、UMAP)、分群
3.1 标准化、高变基因、归一化
- 由于数据量比较大,
ScaleData()
函数比较耗时,尤其是加上回归参数。可使用future包的plan()
函数设置线程数,详见https://satijalab.org/seurat/archive/v3.0/future_vignette.html - 推荐使用四个线程,此外如果在
FindIntegrationAnchors()
步骤使用报错,设置options(future.globals.maxSize = 1000 * 1024^2) #1G plan()
即可
library(future)
plan()
plan("multiprocess", workers = 4)
plan()
scRNA <- NormalizeData(scRNA, normalization.method = "LogNormalize", scale.factor = 10000)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst", nfeatures = 2000)
scRNA <- ScaleData(scRNA, features = VariableFeatures(scRNA),
vars.to.regress = c("nFeature_RNA","percent_mito"))
3.2 降维、分群
scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA))
scRNA = RunUMAP(scRNA, dims = pc.num)
#根据group,展示dimplot
p_dim = DimPlot(scRNA, group.by = "group")
scRNA <- FindNeighbors(scRNA, dims = pc.num)
scRNA <- FindClusters(scRNA, resolution = c(0.01,0.05,0.1,0.2,0.5))
#不同分辨率的分群效果
p_cutree = clustree(scRNA@meta.data, prefix = "RNA_snn_res.")
Idents(scRNA) = scRNA$RNA_snn_res.0.05
table(scRNA@active.ident)
p_dim_1 = DimPlot(scRNA, label = T)
p_dim | p_cutree | p_dim_1
4、细胞类型注释
4.1 逐一展示每个细胞类型的marker gene dotplot,注释cluster
# astrocytes: AQP4, ADGRV1, GPC5, RYR3
# endothelial cells: CLDN5, ABCB1, EBF1
# excitatory neurons: CAMK2A, CBLN2, LDB2
# inhibitory neurons: GAD1, LHFPL3, PCDH15
# microglia: C3, LRMDA, DOCK8
# oligodendrocytes: MBP, PLP1, ST18
astrocytes = c("AQP4", "ADGRV1", "GPC5", "RYR3")
DotPlot(scRNA, features = astrocytes,
assay = "RNA") # 2
endothelial = c("CLDN5", "ABCB1", "EBF1")
DotPlot(scRNA, features = endothelial,
assay = "RNA") # 10
excitatory = c("CAMK2A", "CBLN2", "LDB2")
DotPlot(scRNA, features = excitatory,
assay = "RNA") # 0,6,8,9,11
inhibitory = c("GAD1", "LHFPL3", "PCDH15")
DotPlot(scRNA, features = inhibitory,
assay = "RNA") # 3,4,5
microglia = c("C3", "LRMDA", "DOCK8")
DotPlot(scRNA, features = microglia,
assay = "RNA") # 7
oligodendrocytes = c("MBP", "PLP1", "ST18")
DotPlot(scRNA, features = oligodendrocytes,
assay = "RNA") # 1
- 合并dotplot
gene_list = list(
Astro = astrocytes,
Endo = endothelial,
Excit = excitatory,
Inhib = inhibitory,
Mic = microglia,
Oligo = oligodendrocytes
)
# 调整cluster的level因子顺序,从而较美观的展示dotplot
scRNA@active.ident = factor(scRNA@active.ident, levels = c(2,10,0,6,8,9,11,3,4,5,7,1))
p_dot_1 = DotPlot(scRNA, features = gene_list,
assay = "RNA") +
theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust=0.8))
p_dim_1 | p_dot_1
4.2 根据上述结果,注释细胞类型
scRNA$celltype = ifelse(scRNA@active.ident %in% c(2), "Astro",
ifelse(scRNA@active.ident %in% c(10), "Endo",
ifelse(scRNA@active.ident %in% c(7), "Mic",
ifelse(scRNA@active.ident %in% c(1), "Oligo",
ifelse(scRNA@active.ident %in% c(3,4,5), "Inhib",
"Excit")))))
table(scRNA$celltype)
Idents(scRNA) = scRNA$celltype
scRNA@active.ident = factor(scRNA@active.ident, levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo"))
table(scRNA@active.ident)
p_dot_2= DotPlot(scRNA, features = gene_list,
assay = "RNA") +
theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust=0.8))
p_dim_2= DimPlot(scRNA, label = T)
p_dim_2 | p_dot_2
4.3 复现paper1B-D
#Fig 1B
fig_1b=DimPlot(scRNA, cols = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02'))
#Fig 1C
ct_stat = table(scRNA$celltype)
ct_stat = data.frame(celltype = names(ct_stat),
total = as.numeric(ct_stat),
percentage = ct_stat/sum(ct_stat))
library(ggplot2)
fig_1c=ggplot(ct_stat, aes(x=celltype, fill=celltype, y = percentage.Freq)) +
geom_bar(stat="identity",color = "black") + NoLegend() +
scale_fill_manual(values=c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02')) +
theme_classic() + theme(axis.title.x = element_blank()) +
ylab("Proportion of total nuclei(%)")
#Fig 1D
#Downsample 5000个细胞做Fingallmarker从而节省时间
table(scRNA@active.ident)
CTs = as.character(unique(scRNA@active.ident))
sampled = unlist(lapply(CTs, function(x){
#x = CTs[1]
index = scRNA@active.ident %in% x
cells = names(scRNA@active.ident[index])
if(length(cells) > 5000){
sle_cell = sample(cells, 5000)
} else {
sle_cell = cells
}
}))
scRNA_sample = scRNA[ ,sampled]
dim(scRNA_sample)
table(scRNA_sample@active.ident)
#diff.wilcox = FindAllMarkers(scRNA_sample)
library(tidyverse)
all.markers = diff.wilcox %>% select(gene, everything()) %>%
subset(p_val<0.05 & abs(diff.wilcox$avg_log2FC) > 0.5)
top5 = all.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
top5 = CaseMatch(search = as.vector(top5$gene), match = rownames(scRNA_sample))
#save(top5, file = "top5.rda")
#load("top5.rda")
fig_1d = DoHeatmap(scRNA_sample, features = top5, angle = 0,
group.colors = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02'))
#合并
((fig_1b + labs(tag = "B"))+ (fig_1c + labs(tag = "C")) ) / (fig_1d + labs(tag = "D"))
5、基于细胞类型的AD/NC差异分析(Fig 5A-D复现)
#Fig 2a
fig_2a=DimPlot(scRNA, split.by = "group", cols = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02'))
#Fig 2b
ct_stat2 = as.data.frame(table(scRNA$celltype, scRNA$group))
sums = rep(c(sum(ct_stat2$Freq[1:6]),sum(ct_stat2$Freq[7:12])),each=6)
ct_stat2$percentage = ct_stat2$Freq/sums
ct_stat2$Var2 = factor(ct_stat2$Var2, levels = c("NC","AD"))
library(ggplot2)
fig_2b=ggplot(ct_stat2, aes(x=Var1, fill=Var2, y = percentage)) +
geom_bar( position=position_dodge(width=0.8), width=0.6,
stat="identity",color = "black") +
scale_fill_manual(values=c('white','black')) +
theme_classic() + ylab("Proportion of total nuclei(%)") +
theme(axis.title.x = element_blank(), legend.title = element_blank())
- 每种细胞类型的差异分析
library(future)
plan()
plan("multiprocess", workers = 4)
plan()
head(scRNA@meta.data)
head(scRNA@meta.data[,c("group","celltype")])
scRNA$compare = paste(scRNA$group, scRNA$celltype, sep = "_")
table(scRNA$compare)
# AD_Astro AD_Endo AD_Excit AD_Inhib AD_Mic AD_Oligo
# 10101 1621 32277 19271 4209 23001
# NC_Astro NC_Endo NC_Excit NC_Inhib NC_Mic NC_Oligo
# 8782 598 30050 18021 3714 17871
ct = levels(scRNA@active.ident)
all_markers = lapply(ct, function(x){
# x = ct[1]
print(x)
markers <- FindMarkers(scRNA, group.by = "compare",
logfc.threshold = 0.1,
ident.1 = paste("AD",x,sep = "_"),
ident.2 = paste("NC",x,sep = "_"))
#markers_sig <- subset(markers, p_val_adj < 0.1)
return(markers)
})
length(all_markers)
names(all_markers) = ct
lapply(all_markers,nrow)
- 根据文章提供的阈值,确认差异基因:
compared the individual cell-type between AD and NC samples.
adjusted P < 0.1, log 2 fold change ≥ 0.1 or ≤ −0.1
all_markers_sig = lapply(all_markers, function(x){
markers_sig <- subset(x, p_val_adj < 0.1)
#markers_sig <- subset(x, p_val < 0.0001)
})
marker_stat = as.data.frame(lapply(all_markers_sig,function(x){
# x=all_markers_sig[[1]]
Up = sum(x$avg_log2FC>0)
Down = sum(x$avg_log2FC<0)
Total = length(x$avg_log2FC)
return(c(Up, Down, Total))
}))
rownames(marker_stat) = c("Up","Down","Total")
marker_stat
library(gridExtra)
fig_2c = ggpubr::as_ggplot(tableGrob(marker_stat, theme = ttheme_default(base_size = 25)))
- Fig 2D Veen图
难点:venn.diagram转为ggplot对象,以便于拼图
names(all_markers_sig)
Astro = rownames(all_markers_sig$Astro)
Endo = rownames(all_markers_sig$Endo)
`Excit+Inhib` = unique(c(rownames(all_markers_sig$Excit),
rownames(all_markers_sig$Inhib)))
Mic = rownames(all_markers_sig$Mic)
Oligo = rownames(all_markers_sig$Oligo)
veen_eles2 = list(Astro=Astro, Endo=Endo,
`Excit+Inhib`=`Excit+Inhib`,
Mic=Mic, Oligo=Oligo)
library(VennDiagram)
library(ggplotify)
library(gridGraphics)
plt=venn.diagram(veen_eles2, filename=NULL)
p_tmp <- grobTree(plt)
fig_2d=as.ggplot(plot_grid(p_tmp))
- 合并
(fig_2a + fig_2b)/(fig_2c + fig_2d) + plot_annotation(tag_levels = 'A')
6、以Astro为例的细胞再聚类深入分析(Fig 4A-D)
- Astro细胞类型的clsuter再聚类→分为三大类:
(1) Astro细胞的AD上调基因相对高表达的clusters
(2) Astro细胞的AD下调基因相对高表达的clusters
(3) 其它clusters
对前两大类cluster进行差异分析,以及富集分析(有点绕,可参看原文) - 首先对Astro细胞类型的clsuter,重新分析
library(future)
plan()
plan("multiprocess", workers = 4)
plan()
#subcluster analysis
scRNA_astro = subset(scRNA, celltype == "Astro")
dim(scRNA_astro)
scRNA_astro <- NormalizeData(scRNA_astro, normalization.method = "LogNormalize", scale.factor = 10000)
scRNA_astro <- FindVariableFeatures(scRNA_astro, selection.method = "vst", nfeatures = 2000)
scRNA_astro <- ScaleData(scRNA_astro, features = VariableFeatures(scRNA_astro),
vars.to.regress = c("nFeature_RNA","percent_mito"))
scRNA_astro <- RunPCA(scRNA_astro, features = VariableFeatures(scRNA_astro))
ElbowPlot(scRNA_astro, ndims=30, reduction="pca")
pc.num=1:30
scRNA_astro = RunUMAP(scRNA_astro, dims = pc.num)
DimPlot(scRNA_astro, group.by = "group")
DimPlot(scRNA_astro, split.by = "group")
scRNA_astro <- FindNeighbors(scRNA_astro, dims = pc.num)
scRNA_astro <- FindClusters(scRNA_astro, resolution = c(0.01,0.05,0.1,0.2,0.3,0.4,0.5))
clustree(scRNA_astro@meta.data, prefix = "RNA_snn_res.")
#选择resolution 0.3的分群结果
scRNA_astro$sle_reso = as.numeric(scRNA_astro$RNA_snn_res.0.3)
scRNA_astro$sle_reso = paste0("a",scRNA_astro$sle_reso)
table(scRNA_astro$sle_reso)
scRNA_astro$sle_reso = factor(scRNA_astro$sle_reso, levels = paste0("a",1:12))
Idents(scRNA_astro) = scRNA_astro$sle_reso
table(scRNA_astro@active.ident)
scRNA_astro$group = factor(scRNA_astro$group, levels = c("NC","AD"))
fig_4a=DimPlot(scRNA_astro, split.by = "group")
p_sub_dim = DimPlot(scRNA_astro, label = T)
- 根据上述结果,以及对应paper,分为3大类细胞
scRNA_astro$subpop = ifelse(scRNA_astro@active.ident %in% c("a1","a5"), "Up",
ifelse(scRNA_astro@active.ident %in% c("a4","a3","a11"),
"Down","No change"))
table(scRNA_astro$subpop)
scRNA_astro$subpop = factor(scRNA_astro$subpop, levels = c("Up","Down","No change"))
fig_4b=DimPlot(scRNA_astro, group.by = "subpop", cols = c("#e41a1c","#4daf4a","#d9d9d9"))
- subcluster DEG
marker_Astro <- FindMarkers(scRNA_astro, group.by = "subpop",
ident.1 = "Down",
ident.2 = "Up")
marker_Astro = marker_Astro[order(marker_Astro$avg_log2FC, decreasing = T),]
head(marker_Astro)
scRNA_astro_sub = subset(scRNA_astro, subpop != "No change")
scRNA_astro_sub$subpop = factor(scRNA_astro_sub$subpop, levels = c("Down","Up"))
library(future)
plan()
plan("multiprocess", workers = 4)
plan()
scRNA_astro_sub <- ScaleData(scRNA_astro_sub, features = rownames(marker_Astro),
vars.to.regress = c("nFeature_RNA","percent_mito"))
#Doheatmap结果中,只展示部分感兴趣(与paper重合)的基因注释
paper = c("GLUD1","GLUL","PTN","TNIK","SLC4A4","GABRA2","VEGFA","ETNPPL","SLC1A3",
"TUBB2B","SPARCL1","HES1","WIF1","HES5","APOE","SLC1A2","F3")
table(paper %in% rownames(marker_Astro)[1:100])
genes.to.label = paper[paper %in% rownames(marker_Astro)[1:100]]
labels <- rep(x = "transparent", times = length(x = rownames(marker_Astro)[1:100]))
labels[match(x = genes.to.label, table = rownames(marker_Astro)[1:100])] <- "black"
fig_4c=DoHeatmap(object = scRNA_astro_sub, features = rownames(marker_Astro)[1:100],
group.by = "subpop", group.colors = c("#4daf4a","#e41a1c"), angle = 0) +
theme(axis.text.y = element_text(color = rev(x = labels)))
fig_4c
- 差异基因的富集分析结果
library(clusterProfiler)
library(org.Hs.eg.db)
ego <- enrichGO(gene = rownames(marker_Astro)[1:200],
keyType = "SYMBOL",
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 1,
qvalueCutoff = 1)
res = ego@result
interest_go = which(rownames(ego@result) %in% c("GO:0021953","GO:0050769","GO:0021872","GO:0021879",
"GO:0007409","GO:0042552","GO:0008366","GO:0030900")
interest_go=ego@result$Description[interest_go]
fig_4d=barplot(ego, showCategory=interest_go, cols="pvalue")
- 合并
(fig_4a + fig_4b)/(fig_4c + fig_4d) + plot_layout(widths = c(2,1)) +
plot_annotation(tag_levels = 'A')