单细胞数据挖掘实战:文献复现(二)批量创建Seurat对象及质控
找标记基因这一步其实相当于就是做基因差异分析,这里主要尝试复现文献中的Fig. 2
由于Fig. 1是挑八个样本中的四个进行分析,而从Fig. 2开始就需要全部八个样本的数据了,因此需要重新再走一遍流程,正好也复习一下前面的分析过程。
一、加载R包
if(T){
if(!require(BiocManager))install.packages("BiocManager")
if(!require(Seurat))install.packages("Seurat")
if(!require(Matrix))install.packages("Matrix")
if(!require(ggplot2))install.packages("ggplot2")
if(!require(cowplot))install.packages("cowplot")
if(!require(magrittr))install.packages("magrittr")
if(!require(dplyr))install.packages("dplyr")
if(!require(purrr))install.packages("purrr")
if(!require(ggrepel))install.packages("ggrepel")
if(!require(ggpubr))install.packages("ggpubr")
}
二、读取数据
samples <- dir(path="./GSE136001_RAW/", pattern="^GSM")
for (i in seq_along(samples)){
assign(paste0("samples_raw_data", i), Read10X(data.dir = paste0("./GSE136001_RAW/", samples[i])))
}
三、创建seurat对象
for (i in seq_along(samples)){
assign(paste0("samples_object", i), CreateSeuratObject(counts = eval(parse(text = paste0("samples_raw_data", i))), project = samples[i], min.cells = 5))
}
### merge
samples_objects <- merge(samples_object1, y = c( samples_object2,samples_object3,
samples_object4,samples_object5,
samples_object6,samples_object7,
samples_object8),
add.cell.ids = samples, project = "GBM")
samples_objects
#An object of class Seurat
#14618 features across 41059 samples within 1 assay
#Active assay: RNA (14618 features, 0 variable features)
四、质控
samples_objects <- PercentageFeatureSet(samples_objects, pattern = "^mt-",col.name = "percent_mito")
samples_objects <- subset(samples_objects,subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent_mito < 5)
samples_objects
#An object of class Seurat
#14618 features across 40401 samples within 1 assay
#Active assay: RNA (14618 features, 0 variable features)
1.png
过滤后结果与文章一致。
五、降维、聚类
pca_dims <- 30
umap_n_neighbors <- 30
clustering_resolution <- 0.3
s_genes <- cc.genes$s.genes
g2m_genes <- cc.genes$g2m.genes
samples_objects <- NormalizeData(object = samples_objects, verbose = FALSE)
samples_objects <- FindVariableFeatures(object = samples_objects,
selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
samples_objects@assays$RNA@var.features <- unique(c(samples_objects@assays$RNA@var.features,
s_genes, g2m_genes))
#细胞周期评分
samples_objects <- CellCycleScoring(object = samples_objects, s.features = s_genes, g2m.features = g2m_genes, set.ident = TRUE)
#"Scale Data:", " regress out: nCount_RNA, percent_mito, CC_Difference"
samples_objects$CC_Difference <- samples_objects$S.Score - samples_objects$G2M.Score
seu_object <- ScaleData(object = samples_objects, verbose = FALSE,
vars.to.regress = c("nCount_RNA",
"percent_mito",
"CC_Difference"))
#Run PCA
seu_object <- RunPCA(object = seu_object, npcs = pca_dims, verbose = FALSE)
#Run UMAP
seu_object <- RunUMAP(object = seu_object, reduction = "pca",
dims=1:pca_dims, n.neighbors = umap_n_neighbors, min.dist = 0.5)
#"Find Neighbors:"
seu_object <- FindNeighbors(object = seu_object, dims = 1:pca_dims)
#"Find Clusters:" resolution = 0.6
seu_object <- FindClusters(object = seu_object, resolution = 2*clustering_resolution)
#保存
saveRDS(seu_object, file = "seu_object_merge.RDS")
DimPlot(seu_object,label = T)
2.png
六、细胞注释
gene_marker <- c("Cd14", "Tmem119", "P2ry12", "Csf1", "Crybb1", "Mcm5", "Ifit3", "Tgfbi",
"Ifitm2", "Ifitm3", "S100a6", "Ly6c2", "Ccr2", "Mrc1", "Cd163", "Cd24a",
"Ncam1","Cd14", "Tmem119", "P2ry12", "Tgfbi", "Ifitm2", "Ifitm3", "S100a6", "Ly6c2",
"Ccr2", "Mrc1", "Cd163", "Cd24a", "Ncam1", "Klrk1", "Ncr1", "Cd2", "Cd3d",
"Cd4", "Cd8b1", "Ms4a1")
gene_marker <- unique(gene_marker)
#dotplot
p <- DotPlot(seu_object, features = gene_marker,
assay='RNA' ) + coord_flip()
p
3.png
根据marker基因在Clusters中的表达情况进行注释,并整理成一个anno_cell.csv表格,部分截图如下:
4.png
anno_cell <- read.csv("./anno_cell/anno_cell.csv",header = T)
anno_cells <- anno_cell$cell_type
names(anno_cells) <- levels(seu_object)
seu_object <- RenameIdents(seu_object,anno_cells)
seu_object@meta.data$Main_cell_type <- Idents(seu_object)
DimPlot(seu_object, label = T, label.size = 5,group.by="Main_cell_type" ,
)
5.png
七、提取"MG","Mo/MΦ","BAM"三个亚群
Idents(seu_object) <- seu_object@meta.data$Main_cell_type
seu_object <- subset(seu_object, idents = c("MG","Mo/MΦ","BAM"))
table(seu_object@meta.data$Main_cell_type)
# MG Mo/MΦ BAM T-cells NK B-cells other
# 31959 5624 1680 0 0 0 0
saveRDS(seu_object, file = "seu_object_merge_prediff.RDS")
八、标记基因
#数据标准化
seu_object <- ScaleData(seu_object)
#看一下前面的结果
GetAssayData(seu_object,slot="data",assay="RNA")[1:4,1:4]
#挑选差异基因(耗内存,8G内存跑不出来,后面换了个16G内存的电脑才跑出结果)
markers_cell_types_3_groups <- FindAllMarkers(object = seu_object,only.pos = TRUE, min.pct = 0.25,
logfc.threshold = 0.25)
head(markers_cell_types_3_groups)
# p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
#P2ry12 0 4.251822 0.985 0.274 0 MG P2ry12
#Hexb 0 3.838659 0.997 0.736 0 MG Hexb
#Tmem119 0 3.689420 0.967 0.185 0 MG Tmem119
#Sparc 0 3.675201 0.965 0.146 0 MG Sparc
#Gpr34 0 3.593376 0.963 0.207 0 MG Gpr34
#Olfml3 0 3.310770 0.956 0.264 0 MG Olfml3
#保存
save(markers_cell_types_3_groups,file = "markers_cell_types_3_groups.Rdata")
九、可视化
fig2a 右边
DimPlot(seu_object, label = F, label.size = 5,group.by="Main_cell_type" ,
cols = c("#52b0e6","#fbc000","#0cd2ae"))
6.png
Figure 2b
library(ComplexHeatmap)
library(circlize)
#定义颜色
col_Micro<-"#53AFE6"
col_Macro<-"#FABF00"
col_BAM<-"#0DD1AD"
get_top10_markers_gnames <- markers_cell_types_3_groups %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
markers_expression_data <- GetAssayData(object = seu_object, slot = "data")
markers_expression_data <- markers_expression_data[match(get_top10_markers_gnames$gene,
rownames(markers_expression_data)),
order(Idents(seu_object))]
markers_expression_data <- t(apply(markers_expression_data, 1, function(x) {
x/(quantile(x, probs=0.999))}))
markers_expression_data[markers_expression_data > 1] <- 1
markers_expression_data[markers_expression_data < 0] <- 0
col_annotations <- data.frame(cell_id = colnames(seu_object),
cluster_id = Idents(seu_object))
rownames(col_annotations) <- col_annotations$cell_id
col_fun <- colorRamp2(c(0, 0.25, 0.5, 0.75, 1), c("#edf8fb", "#b3cde3", "#8c96c6", "#8856a7", "#810f7c"))
ht_list <- Heatmap(markers_expression_data, name = "a",
top_annotation =
HeatmapAnnotation(
Cluster = col_annotations[colnames(markers_expression_data), "cluster_id"],
col = list(Cluster = c("MG" = col_Micro,
"Mo/MΦ" = col_Macro,
"BAM" = col_BAM)),
simple_anno_size = unit(3, "cm"),
annotation_legend_param = list(Cluster = list(direction = "horizontal"),
title_gp = gpar(fontsize = 15),
labels_gp = gpar(fontsize = 15),
legend_height = unit(5, "cm"), ncol = 3)),
cluster_rows = F, cluster_columns = F, show_column_names = F,
row_names_gp = gpar(fontsize = 25), col = col_fun, row_split =
rep(c("A", "B", "C"), each = 10),
heatmap_legend_param = list(title = "Expression", direction = "horizontal",
legend_height = unit(3, "cm"),
title_gp = gpar(fontsize = 15),
labels_gp = gpar(fontsize = 15),
at = c(0, 1), labels = c("Low", "High")),
use_raster = T, raster_device = "png"
)
pdf(("fig2b_heatmap.pdf.pdf"), onefile = FALSE, width = 15, height = 15)
p <- draw(ht_list, merge_legend = T, heatmap_legend_side = "top", annotation_legend_side = "top")
dev.off()
右侧显示的每个亚群的前10个基因与原文一致
7.png
Figure 2c, 2d, 2f
- 均为Feature Plots,只需更改下面代码中gene_to_plot <- "Tmem119"的基因名即可,这里只画Tmem119进行展示
- 2c: Tmem119, P2ry12
- 2d: Lgals3, Ptprc
- 2f: Ccr2, Ly6c2, Ly6i, Tgfbi, Ccl5, Cd274, Ccl22, Itga4
gene_to_plot <- "Tmem119"
FeaturePlot(seu_object, features = annot[annot$Gene.name == gene_to_plot, "Gene.stable.ID"]) +
scale_color_gradientn(colors=viridis::viridis(n=50)) +
ggtitle(gene_to_plot)
8.png
Figure 2g
genes<-c("Ccr2", "Ly6c2","Ly6i","Tgfbi","Ccl5", "Cd274", "Ccl22", "Itga4")
gene_expression_data <- GetAssayData(object = seu_object, slot = "data")
gene_expression_data <- as.data.frame(t(gene_expression_data[genes, ]))
gene_expression_data$cell_types_3_groups <- Idents(seu_object)
gene_expression_data$clusters_0.6 <- seu_object$RNA_snn_res.0.6
gene_expression_data <- gene_expression_data[gene_expression_data$cell_types_3_groups == "Mo/MΦ", ]
gene_expression_data$MoM_subtype <- NULL
gene_expression_data$MoM_subtype[gene_expression_data$clusters_0.6 == 11] <- "Mo"
gene_expression_data$MoM_subtype[gene_expression_data$clusters_0.6 == 4] <- "intMoM"
gene_expression_data$MoM_subtype[gene_expression_data$clusters_0.6 %in% c(8,16,18,19)] <- "M"
gene_expression_data$MoM_subtype <- factor(gene_expression_data$MoM_subtype)
gene_expression_data$MoM_subtype <- factor(gene_expression_data$MoM_subtype,
levels = levels(gene_expression_data$MoM_subtype)[c(2,3,1)])
graphs<-list()
for (i in seq_along(genes)){
names<-c("Ccr2", "Ly6c2", "Ly6i", "Tgfbi",
"Ccl5", "Cd274(PD-L1)", "Ccl22", "Itga4(CD49d)")
plot<- ggplot(gene_expression_data[gene_expression_data$cell_types_3_groups == "Mo/MΦ", ],
aes(x="", fill = MoM_subtype)) +
geom_density(aes_string(x=as.name(genes[i])), trim=T, alpha=0.8, size=0.5)+
scale_fill_manual(values= c( "#FFDB3F", "orange", "#F0627C"))+
labs(title=names[i], x="", y="")+
ylim(0,1)+
theme_classic()+
theme(legend.title = element_blank())
graphs[[i]] <- plot
}
pdf(("fig2g2_Density_Plots.pdf"), onefile = FALSE, width = 10, height = 20)
ggarrange(plotlist=graphs, ncol=2, nrow=4, common.legend = T, legend = "top")
dev.off()
这个图由于需要根据文献中提到的基因自己判断"Mo","intMoM","M"的clusters,所以跟原文有细微差别
9.png
往期单细胞数据挖掘实战: