单细胞数据挖掘实战：文献复现（六）标记基因及可视化

单细胞数据挖掘实战：文献复现（一）批量读取数据

单细胞数据挖掘实战：文献复现（二）批量创建Seurat对象及质控

单细胞数据挖掘实战：文献复现（三）降维、聚类和细胞注释

单细胞数据挖掘实战：文献复现（四）细胞比例饼图

单细胞数据挖掘实战：文献复现（五）细胞亚群并可视化

找标记基因这一步其实相当于就是做基因差异分析，这里主要尝试复现文献中的Fig. 2

由于Fig. 1是挑八个样本中的四个进行分析，而从Fig. 2开始就需要全部八个样本的数据了，因此需要重新再走一遍流程，正好也复习一下前面的分析过程。

一、加载R包

if(T){
  if(!require(BiocManager))install.packages("BiocManager")
  if(!require(Seurat))install.packages("Seurat")
  if(!require(Matrix))install.packages("Matrix")
  if(!require(ggplot2))install.packages("ggplot2")
  if(!require(cowplot))install.packages("cowplot")
  if(!require(magrittr))install.packages("magrittr")
  if(!require(dplyr))install.packages("dplyr")
  if(!require(purrr))install.packages("purrr")
  if(!require(ggrepel))install.packages("ggrepel")
  if(!require(ggpubr))install.packages("ggpubr")
}

二、读取数据

samples <- dir(path="./GSE136001_RAW/", pattern="^GSM")
for (i in seq_along(samples)){
  assign(paste0("samples_raw_data", i), Read10X(data.dir = paste0("./GSE136001_RAW/", samples[i])))
}

三、创建seurat对象

for (i in seq_along(samples)){
  assign(paste0("samples_object", i), CreateSeuratObject(counts = eval(parse(text = paste0("samples_raw_data", i))), project = samples[i], min.cells = 5))
}

### merge
samples_objects <- merge(samples_object1, y = c( samples_object2,samples_object3,
                                                 samples_object4,samples_object5,
                                                 samples_object6,samples_object7,
                                                 samples_object8),
                         add.cell.ids = samples, project = "GBM")
samples_objects
#An object of class Seurat 
#14618 features across 41059 samples within 1 assay 
#Active assay: RNA (14618 features, 0 variable features)

四、质控

samples_objects <- PercentageFeatureSet(samples_objects, pattern = "^mt-",col.name = "percent_mito")
samples_objects <- subset(samples_objects,subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent_mito < 5)
samples_objects
#An object of class Seurat 
#14618 features across 40401 samples within 1 assay 
#Active assay: RNA (14618 features, 0 variable features)

1.png

过滤后结果与文章一致。

五、降维、聚类

pca_dims <- 30
umap_n_neighbors <- 30
clustering_resolution <- 0.3

s_genes <- cc.genes$s.genes
g2m_genes <- cc.genes$g2m.genes

samples_objects <- NormalizeData(object = samples_objects, verbose = FALSE)
samples_objects <- FindVariableFeatures(object = samples_objects, 
                                        selection.method = "vst", 
                                        nfeatures = 2000, verbose = FALSE)
samples_objects@assays$RNA@var.features <- unique(c(samples_objects@assays$RNA@var.features, 
                                                    s_genes, g2m_genes))
#细胞周期评分
samples_objects <- CellCycleScoring(object = samples_objects, s.features = s_genes, g2m.features = g2m_genes, set.ident = TRUE)  
#"Scale Data:", " regress out: nCount_RNA, percent_mito, CC_Difference"
samples_objects$CC_Difference <- samples_objects$S.Score - samples_objects$G2M.Score
seu_object <- ScaleData(object = samples_objects, verbose = FALSE, 
                        vars.to.regress = c("nCount_RNA", 
                                            "percent_mito",
                                            "CC_Difference"))  

#Run PCA
seu_object <- RunPCA(object = seu_object, npcs = pca_dims, verbose = FALSE)
#Run UMAP
seu_object <- RunUMAP(object = seu_object, reduction = "pca", 
                      dims=1:pca_dims, n.neighbors = umap_n_neighbors, min.dist = 0.5)
#"Find Neighbors:"
seu_object <- FindNeighbors(object = seu_object, dims = 1:pca_dims)
#"Find Clusters:" resolution = 0.6
seu_object <- FindClusters(object = seu_object, resolution = 2*clustering_resolution) 

#保存
saveRDS(seu_object, file = "seu_object_merge.RDS")
DimPlot(seu_object,label = T)

2.png

六、细胞注释

gene_marker <- c("Cd14", "Tmem119", "P2ry12", "Csf1", "Crybb1", "Mcm5", "Ifit3", "Tgfbi", 
                 "Ifitm2", "Ifitm3", "S100a6", "Ly6c2", "Ccr2", "Mrc1", "Cd163", "Cd24a", 
                 "Ncam1","Cd14", "Tmem119", "P2ry12", "Tgfbi", "Ifitm2", "Ifitm3", "S100a6", "Ly6c2",
                 "Ccr2", "Mrc1", "Cd163", "Cd24a", "Ncam1", "Klrk1", "Ncr1", "Cd2", "Cd3d",
                 "Cd4", "Cd8b1", "Ms4a1")
gene_marker <- unique(gene_marker)
#dotplot
p <- DotPlot(seu_object, features = gene_marker,
             assay='RNA'  )  + coord_flip()
p

3.png

根据marker基因在Clusters中的表达情况进行注释，并整理成一个anno_cell.csv表格，部分截图如下：

4.png

anno_cell <- read.csv("./anno_cell/anno_cell.csv",header = T)
anno_cells <- anno_cell$cell_type
names(anno_cells) <- levels(seu_object)
seu_object <- RenameIdents(seu_object,anno_cells)
seu_object@meta.data$Main_cell_type <- Idents(seu_object)
DimPlot(seu_object, label = T, label.size = 5,group.by="Main_cell_type" ,
)

5.png

七、提取"MG","Mo/MΦ","BAM"三个亚群

Idents(seu_object) <- seu_object@meta.data$Main_cell_type
seu_object <- subset(seu_object, idents = c("MG","Mo/MΦ","BAM"))
table(seu_object@meta.data$Main_cell_type)
#     MG   Mo/MΦ     BAM T-cells      NK B-cells   other 
#  31959    5624    1680       0       0       0       0 
saveRDS(seu_object, file = "seu_object_merge_prediff.RDS")

八、标记基因

#数据标准化
seu_object <- ScaleData(seu_object)
#看一下前面的结果
GetAssayData(seu_object,slot="data",assay="RNA")[1:4,1:4]
#挑选差异基因（耗内存，8G内存跑不出来，后面换了个16G内存的电脑才跑出结果）
markers_cell_types_3_groups <- FindAllMarkers(object = seu_object,only.pos = TRUE, min.pct = 0.25,
                                              logfc.threshold = 0.25)
head(markers_cell_types_3_groups)
#        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster    gene
#P2ry12      0   4.251822 0.985 0.274         0      MG  P2ry12
#Hexb        0   3.838659 0.997 0.736         0      MG    Hexb
#Tmem119     0   3.689420 0.967 0.185         0      MG Tmem119
#Sparc       0   3.675201 0.965 0.146         0      MG   Sparc
#Gpr34       0   3.593376 0.963 0.207         0      MG   Gpr34
#Olfml3      0   3.310770 0.956 0.264         0      MG  Olfml3
#保存
save(markers_cell_types_3_groups,file = "markers_cell_types_3_groups.Rdata")

九、可视化

fig2a 右边

DimPlot(seu_object, label = F, label.size = 5,group.by="Main_cell_type" ,
        cols = c("#52b0e6","#fbc000","#0cd2ae"))

6.png

Figure 2b

library(ComplexHeatmap)
library(circlize)
#定义颜色
col_Micro<-"#53AFE6"
col_Macro<-"#FABF00"
col_BAM<-"#0DD1AD"

get_top10_markers_gnames <- markers_cell_types_3_groups %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
markers_expression_data <- GetAssayData(object = seu_object, slot = "data")
markers_expression_data <- markers_expression_data[match(get_top10_markers_gnames$gene,
                                                         rownames(markers_expression_data)),
                                                   order(Idents(seu_object))]
markers_expression_data <- t(apply(markers_expression_data, 1, function(x) {
  x/(quantile(x, probs=0.999))}))

markers_expression_data[markers_expression_data > 1] <- 1  
markers_expression_data[markers_expression_data < 0] <- 0

col_annotations <- data.frame(cell_id = colnames(seu_object),
                              cluster_id = Idents(seu_object))
rownames(col_annotations) <- col_annotations$cell_id
col_fun <- colorRamp2(c(0, 0.25, 0.5, 0.75, 1), c("#edf8fb", "#b3cde3", "#8c96c6", "#8856a7", "#810f7c"))
ht_list <- Heatmap(markers_expression_data, name = "a", 
                   top_annotation = 
                     HeatmapAnnotation(
                       Cluster = col_annotations[colnames(markers_expression_data), "cluster_id"], 
                       col = list(Cluster = c("MG" = col_Micro, 
                                              "Mo/MΦ" = col_Macro, 
                                              "BAM" = col_BAM)),
                       simple_anno_size = unit(3, "cm"),
                       annotation_legend_param = list(Cluster = list(direction = "horizontal"), 
                                                      title_gp = gpar(fontsize = 15), 
                                                      labels_gp = gpar(fontsize = 15), 
                                                      legend_height = unit(5, "cm"), ncol = 3)),
                   cluster_rows = F, cluster_columns = F, show_column_names = F, 
                   row_names_gp = gpar(fontsize = 25), col = col_fun, row_split = 
                     rep(c("A", "B", "C"), each = 10),
                   heatmap_legend_param = list(title = "Expression", direction = "horizontal", 
                                               legend_height = unit(3, "cm"), 
                                               title_gp = gpar(fontsize = 15), 
                                               labels_gp = gpar(fontsize = 15), 
                                               at = c(0, 1), labels = c("Low", "High")), 
                   use_raster = T, raster_device = "png"
)
pdf(("fig2b_heatmap.pdf.pdf"), onefile = FALSE, width = 15, height = 15)
p <- draw(ht_list, merge_legend = T, heatmap_legend_side = "top", annotation_legend_side = "top")
dev.off()

右侧显示的每个亚群的前10个基因与原文一致

7.png

Figure 2c, 2d, 2f

均为Feature Plots，只需更改下面代码中gene_to_plot <- "Tmem119"的基因名即可，这里只画Tmem119进行展示

2c: Tmem119, P2ry12

2d: Lgals3, Ptprc

2f: Ccr2, Ly6c2, Ly6i, Tgfbi, Ccl5, Cd274, Ccl22, Itga4

gene_to_plot <- "Tmem119"
FeaturePlot(seu_object, features = annot[annot$Gene.name == gene_to_plot, "Gene.stable.ID"]) + 
    scale_color_gradientn(colors=viridis::viridis(n=50)) +
    ggtitle(gene_to_plot)

8.png

Figure 2g

genes<-c("Ccr2", "Ly6c2","Ly6i","Tgfbi","Ccl5", "Cd274", "Ccl22", "Itga4")
gene_expression_data <- GetAssayData(object = seu_object, slot = "data")
gene_expression_data <- as.data.frame(t(gene_expression_data[genes, ]))
gene_expression_data$cell_types_3_groups <- Idents(seu_object)
gene_expression_data$clusters_0.6 <- seu_object$RNA_snn_res.0.6
gene_expression_data <- gene_expression_data[gene_expression_data$cell_types_3_groups == "Mo/MΦ", ]

gene_expression_data$MoM_subtype <- NULL
gene_expression_data$MoM_subtype[gene_expression_data$clusters_0.6 == 11] <- "Mo"
gene_expression_data$MoM_subtype[gene_expression_data$clusters_0.6 == 4] <- "intMoM"
gene_expression_data$MoM_subtype[gene_expression_data$clusters_0.6 %in% c(8,16,18,19)] <- "M"
gene_expression_data$MoM_subtype <- factor(gene_expression_data$MoM_subtype)
gene_expression_data$MoM_subtype <- factor(gene_expression_data$MoM_subtype, 
                                           levels = levels(gene_expression_data$MoM_subtype)[c(2,3,1)])

graphs<-list()
for (i in seq_along(genes)){
  names<-c("Ccr2", "Ly6c2", "Ly6i", "Tgfbi",
           "Ccl5", "Cd274(PD-L1)", "Ccl22", "Itga4(CD49d)")
  plot<- ggplot(gene_expression_data[gene_expression_data$cell_types_3_groups == "Mo/MΦ", ],
                aes(x="", fill = MoM_subtype)) +
    geom_density(aes_string(x=as.name(genes[i])), trim=T, alpha=0.8, size=0.5)+
    scale_fill_manual(values= c( "#FFDB3F", "orange", "#F0627C"))+
    labs(title=names[i], x="", y="")+
    ylim(0,1)+
    theme_classic()+
    theme(legend.title = element_blank())
  graphs[[i]] <- plot
}
pdf(("fig2g2_Density_Plots.pdf"), onefile = FALSE, width = 10, height = 20)
ggarrange(plotlist=graphs, ncol=2, nrow=4, common.legend = T, legend = "top")
dev.off()