学习单细胞分析—概念理解（3）

以下内容来自于哈佛大学课程：https://hbctraining.github.io/Intro-to-scRNAseq/schedule/links-to-lessons.html

一般在聚类分群时，都是把要分析的样本混在一起，先总体分析，方便看出每个样本细胞群是否存在明显的不同

If cells cluster by sample, condition, batch, dataset, modality, this integration step can greatly improve the clustering and the downstream analyses.
批次效应（Batch effect）通常指的是实验指标检测中，来源关注的生物学处理效应之外的其他因素导致的样本结果的波动。

典型相关分析（Canonical Correlation analysis）是研究两组变量（每组变量中都可能有多个指标）之间相关关系的一种多元统计方法。它能够揭示出两组变量之间的内在联系。可用于分析多个单细胞数据中，不同样本数据之间的相关性。
具体解释可参考：(2 封私信) 单细胞基础分析之多样本整合篇 - 知乎 https://zhuanlan.zhihu.com/p/562535338

#Integration using CCA( canonical correlation analysis，典型相关分析)
#使用高度可变基因是因为它们最有可能代表那些区分不同细胞类型的基因。

split_seurat <- readRDS("data/split_seurat.rds")
# Select the most variable features to use for integration
#第一步：寻找anchor
integ_features <- SelectIntegrationFeatures(object.list = split_seurat, 
                                            nfeatures = 3000) 
# Prepare the SCT list object for integration
#第二步：过滤不可信的anchor。
split_seurat <- PrepSCTIntegration(object.list = split_seurat, 
                                   anchor.features = integ_features)
# Find best buddies - can take a while to run
#第三步：量化邻居对的可靠程度，采用的是共享邻居的比例。
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat, 
                                        normalization.method = "SCT", 
                                        anchor.features = integ_features)
# Integrate across conditions
#第四步，依据选找到的可靠Anchor，进行数据整合
seurat_integrated <- IntegrateData(anchorset = integ_anchors, 
                                   normalization.method = "SCT")

下一步，降维

主成分分析（PCA）作为最经典的降维方法，能够将高维数据投影到低维空间，同时最大化保留原始数据的变异信息。

在进行PCA分析之前，需要进行数据缩放（ScaleData），目的是将每个基因的表达值转换为标准分数（z-score），确保每个基因对PCA的贡献是均等的，避免高表达基因主导整个分析过程。

##整合后，为了可视化整合的数据，我们可以使用降维技术，如PCA和均匀流形近似和投影(UMAP)。
# Run PCA
seurat_integrated <- RunPCA(object = seurat_integrated)
# Plot PCA
PCAPlot(seurat_integrated,
        split.by = "sample")  

#UMAP是一种随机算法。不同运行会产生不同的结果。设定种子，这避免了每次重新运行我们的代码时创建稍微不同的UMAP。
# Set seed
set.seed(123456)
# Run UMAP
seurat_integrated <- RunUMAP(seurat_integrated, 
                             dims = 1:40,
                 reduction = "pca")
# Plot UMAP                             
DimPlot(seurat_integrated)   
# Plot UMAP split by sample
DimPlot(seurat_integrated,
        split.by = "sample") 

# Save integrated seurat object
saveRDS(seurat_integrated, "results/integrated_seurat.rds")

确定合适的主成分数量是PCA分析中的关键决策。使用过少的主成分可能丢失重要的生物学变异信息，而使用过多的主成分则可能引入噪音并增加计算负担。肘部图（Elbow Plot）是最常用的判断方法。
[单细胞转录组分析系列教程 4] 怎么做单细胞PCA降维分析？ https://mp.weixin.qq.com/s?src=11&timestamp=1756368723&ver=6201&signature=K59zZFYJ-O2Ik-wm27U34jBRRTCJHDULjPfjGACCqJaSoFEoWA0wdT524jdedWmqpbeIyEwN962MGOMr-XGhZ3UHlPc9r1SofbGCuudOF5AuP3Pst*ZS0cmnhf14VUoP&new=1

# Explore heatmap of PCs
DimHeatmap(seurat_integrated, 
           dims = 1:9, 
           cells = 500, 
           balanced = TRUE)
# Printing out the most variable genes driving PCs
print(x = seurat_integrated[["pca"]], 
      dims = 1:10, 
      nfeatures = 5)
# Plot the elbow plot
ElbowPlot(object = seurat_integrated, 
          ndims = 40)
#用最后这一个图会好点，拐点即比较好的PC值

#可以尝试做
#查看方差解释比例。这不仅能帮助我们确定主成分数量，还能了解数据的复杂程度。
# 计算方差解释比例
pca_variance <- (seurat_integrated[["pca"]]@stdev)^2
variance_explained <- pca_variance / sum(pca_variance) * 100
cumulative_variance <- cumsum(variance_explained)

# 图4: 方差解释比例图
variance_df <- data.frame(
  PC = 1:20,
  Individual = variance_explained[1:20],
  Cumulative = cumulative_variance[1:20]
)

p4 <- ggplot(variance_df, aes(x = PC)) +
  geom_col(aes(y = Individual), alpha = 0.7, fill = "#3498DB") +
  geom_line(aes(y = Cumulative), color = "#E74C3C", size = 1.2) +
  geom_point(aes(y = Cumulative), color = "#E74C3C", size = 2) +
  labs(title = "PCA Variance Explained", 
       x = "Principal Components", 
       y = "Variance Explained (%)",
       subtitle = "Bars: Individual variance, Line: Cumulative variance")

细胞分群

使用K-最近邻方法

# Determine the K-nearest neighbor graph
#第一步是构建一个K-最近邻(KNN)图基于PCA空间中的欧氏距离。
seurat_integrated <- FindNeighbors(object = seurat_integrated, 
                                dims = 1:40)
# Determine the clusters for various resolutions    
#以优化标准模块功能为目标，反复对单元进行分组。                            
seurat_integrated <- FindClusters(object = seurat_integrated,
                               resolution = c(0.4, 0.6, 0.8, 1.0, 1.4))

# Explore resolutions
seurat_integrated@meta.data %>% 
        View()
# Assign identity of clusters
Idents(object = seurat_integrated) <- "integrated_snn_res.0.8"
#可视化细胞群，通过降维方式进行绘图
seurat_integrated <- RunUMAP(seurat_integrated, 
                   reduction = "pca", 
                   dims = 1:40)
# Plot the UMAP
DimPlot(seurat_integrated,
        reduction = "umap",
        label = TRUE,
        label.size = 6)

Segregation of clusters by sample
如下绘制:条形图，UMAP图，叠加条形图,VinPlot

# Extract identity and sample information from seurat object to determine the number of cells per cluster per sample
n_cells <- FetchData(seurat_integrated, 
                     vars = c("ident", "sample")) %>%
        dplyr::count(ident, sample)

# Barplot of number of cells per cluster by sample
ggplot(n_cells, aes(x=ident, y=n, fill=sample)) +
    geom_bar(position=position_dodge(), stat="identity") +
    geom_text(aes(label=n), vjust = -.2, position=position_dodge(1))
# UMAP of cells in each cluster by sample
DimPlot(seurat_integrated, 
        label = TRUE, 
        split.by = "sample")  + NoLegend()
# Barplot of proportion of cells in each cluster by sample
ggplot(seurat_integrated@meta.data) +
    geom_bar(aes(x=integrated_snn_res.0.8, fill=sample), position=position_fill()) 
# Vln plot 
VlnPlot(object = seurat_integrated, 
        features = c("HSPH1", "HSPE1", "DNAJB1"))

#看看细胞周期是否影响分群
# Explore whether clusters segregate by cell cycle phase
DimPlot(seurat_integrated,
        label = TRUE, 
        split.by = "Phase")  + NoLegend()
#Segregation of clusters by various sources of uninteresting variation
# Determine metrics to plot present in seurat_integrated@meta.data
metrics <-  c("nUMI", "nGene", "S.Score", "G2M.Score", "mitoRatio")

FeaturePlot(seurat_integrated, 
            reduction = "umap", 
            features = metrics,
            pt.size = 0.4, 
            order = TRUE,
            min.cutoff = 'q10',
            label = TRUE)
#Boxplot of nGene per cluster
ggplot(seurat_integrated@meta.data) +
    geom_boxplot(aes(x=integrated_snn_res.0.8, y=nGene, fill=integrated_snn_res.0.8)) +
    NoLegend()

探索已知的细胞类型标记

有了细胞聚集，我们可以通过寻找已知的标记来探索细胞类型身份。显示标有簇的UMAP图，然后是期望的不同细胞类型。
FeaturePlot（）函数在UMAP可视化的基础上查看基因表达情况，用于检查并确定集群的身份。DotPlot

#UMAP基因（ features）表达图
DefaultAssay(seurat_integrated) <- "RNA"
#features，可以包含不同免疫细胞群的特征基因
FeaturePlot(seurat_integrated, 
            reduction = "umap", 
            features = c("CD14", "LYZ"), 
            order = TRUE,
            min.cutoff = 'q10', 
            label = TRUE)

#点形热图去查看
# List of known celltype markers
markers <- list()
markers[["CD14+ monocytes"]] <- c("CD14", "LYZ")
markers[["FCGR3A+ monocyte"]] <- c("FCGR3A", "MS4A7")
markers[["Macrophages"]] <- c("MARCO", "ITGAM", "ADGRE1")
markers[["Conventional dendritic"]] <- c("FCER1A", "CST3")
markers[["Plasmacytoid dendritic"]] <- c("IL3RA", "GZMB", "SERPINF1", "ITM2C")
# Create dotplot based on RNA expression
DotPlot(seurat_integrated, markers, assay="RNA")

FindAllMarkers()：每个聚类的所有标记的标识，通常评估单个样品组/条件时推荐使用。
FindConservedMarkers()：数据集中有代表不同条件的样本，可选择的是找到保守的标记，先各自计算不同样本的分群特征基因，再统一组合起来评价。

# Find markers for every cluster compared to all remaining cells, report only the positive ones
markers <- FindAllMarkers(object = seurat_integrated, 
                          only.pos = TRUE,
                          logfc.threshold = 0.25)     

DefaultAssay(seurat_integrated) <- "RNA"
# Create function to get conserved markers for any given cluster
get_conserved <- function(cluster){
  FindConservedMarkers(seurat_integrated,
                       ident.1 = cluster,
                       grouping.var = "sample",
                       only.pos = TRUE) %>%
    rownames_to_column(var = "gene") %>%
    left_join(y = unique(annotations[, c("gene_name", "description")]),
               by = c("gene" = "gene_name")) %>%
    cbind(cluster_id = cluster, .)
  }
map_dfr(inputs_to_function, name_of_function)
# Iterate function across desired clusters
conserved_markers <- map_dfr(c(4,0,6,2), get_conserved)
# Extract top 10 markers per cluster
top10 <- conserved_markers %>% 
  mutate(avg_fc = (ctrl_avg_log2FC + stim_avg_log2FC) /2) %>% 
  group_by(cluster_id) %>% 
  top_n(n = 10, 
        wt = avg_fc)

# Visualize top 10 markers per cluster
View(top10)

将群集的身份重新分配给这些细胞类型

# Rename all identities
seurat_integrated <- RenameIdents(object = seurat_integrated, 
                               "0" = "Naive or memory CD4+ T cells",
                               "1" = "CD14+ monocytes",
                               "2" = "Activated T cells",
                               "3" = "CD14+ monocytes",
                               "4" = "Stressed cells / Unknown",
                               "5" = "CD8+ T cells",
                               "6" = "Naive or memory CD4+ T cells",
                               "7" = "B cells",
                               "8" = "NK cells",
                               "9" = "CD8+ T cells",
                               "10" = "FCGR3A+ monocytes",
                               "11" = "B cells",
                               "12" = "NK cells",
                               "13" = "B cells",
                               "14" = "Conventional dendritic cells",
                               "15" = "Megakaryocytes",
                   "16" = "Plasmacytoid dendritic cells")


# Plot the UMAP
DimPlot(object = seurat_integrated, 
        reduction = "umap", 
        label = TRUE,
        label.size = 3,
        repel = TRUE)

已经定义了我们的聚类以及每个聚类的标记，可以进行如下操作

# Add celltype annotation as a column in meta.data 
seurat_subset_labeled$celltype <- Idents(seurat_subset_labeled)

# Compute number of cells per celltype
n_cells <- FetchData(seurat_subset_labeled, 
                     vars = c("celltype", "sample")) %>%
        dplyr::count(celltype, sample)

# Barplot of number of cells per celltype by sample
ggplot(n_cells, aes(x=celltype, y=n, fill=sample)) +
    geom_bar(position=position_dodge(), stat="identity") +
    geom_text(aes(label=n), vjust = -.2, position=position_dodge(1))


# Subset seurat object to just B cells
seurat_b_cells <- subset(seurat_subset_labeled, subset = (celltype == "B cells"))

# Run a wilcox test to compare ctrl vs stim
b_markers <- FindMarkers(seurat_b_cells,
                                    ident.1 = "ctrl",
                                    ident.2 = "stim",
                                    grouping.var = "sample",
                                    only.pos = FALSE,
                                    logfc.threshold = 0.25)
library(EnhancedVolcano)
EnhancedVolcano(b_markers,
    row.names(b_markers),
    x="avg_log2FC",
    y="p_val_adj"
)