02.SP

#!/usr/bin/env Rscript

# 空间基因表达分布图谱分析

# 输入:质控后数据

# 输出:空间分布图谱、处理后数据

# 加载必要的包

suppressMessages({

  library(Seurat)

  library(ggplot2)

  library(dplyr)

  library(patchwork)

  library(viridis)

  library(cowplot)

  library(RColorBrewer)

  library(pheatmap)

})

# 读取Snakemake参数

input_qc_data <- snakemake@input[["qc_data"]]

top_genes <- snakemake@params[["top_genes"]]

resolution <- snakemake@params[["resolution"]]

# 输出文件路径

output_spatial_plots <- snakemake@output[["spatial_plots"]]

output_spatial_data <- snakemake@output[["spatial_data"]]

# 创建输出目录

dir.create(dirname(output_spatial_plots), recursive = TRUE, showWarnings = FALSE)

# 读取质控数据

cat("正在读取质控数据...\n")

seurat_obj <- readRDS(input_qc_data)

# 数据标准化

cat("正在进行数据标准化...\n")

seurat_obj <- NormalizeData(seurat_obj, normalization.method = "LogNormalize", scale.factor = 10000)

# 寻找高变基因

cat("正在寻找高变基因...\n")

seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "vst", nfeatures = 2000)

# 获取高变基因

top_variable_genes <- head(VariableFeatures(seurat_obj), top_genes)

# 数据缩放

cat("正在进行数据缩放...\n")

seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))

# 主成分分析

cat("正在进行主成分分析...\n")

seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj), verbose = FALSE)

# 聚类分析

cat("正在进行聚类分析...\n")

seurat_obj <- FindNeighbors(seurat_obj, dims = 1:10)

seurat_obj <- FindClusters(seurat_obj, resolution = resolution)

# UMAP降维

cat("正在进行UMAP降维...\n")

seurat_obj <- RunUMAP(seurat_obj, dims = 1:10)

# 创建图表列表

spatial_plots <- list()

# 1. 高变基因图

spatial_plots[["variable_features"]] <- VariableFeaturePlot(seurat_obj) +

  labs(title = "高变基因识别") +

  theme_minimal()

# 2. PCA图

spatial_plots[["pca"]] <- DimPlot(seurat_obj, reduction = "pca", dims = c(1, 2)) +

  labs(title = "主成分分析") +

  theme_minimal()

# 3. PCA贡献图

spatial_plots[["pca_loadings"]] <- VizDimLoadings(seurat_obj, dims = 1:2, reduction = "pca") +

  labs(title = "主成分贡献") +

  theme_minimal()

# 4. 肘部图

spatial_plots[["elbow"]] <- ElbowPlot(seurat_obj) +

  labs(title = "主成分重要性") +

  theme_minimal()

# 5. UMAP聚类图

spatial_plots[["umap_clusters"]] <- DimPlot(seurat_obj, reduction = "umap", label = TRUE) +

  labs(title = "UMAP聚类结果") +

  theme_minimal()

# 6. 聚类统计图

cluster_counts <- table(Idents(seurat_obj))

cluster_df <- data.frame(

  Cluster = names(cluster_counts),

  Count = as.numeric(cluster_counts)

)

spatial_plots[["cluster_counts"]] <- ggplot(cluster_df, aes(x = Cluster, y = Count, fill = Cluster)) +

  geom_bar(stat = "identity") +

  labs(title = "聚类细胞数量分布", x = "聚类", y = "细胞数量") +

  theme_minimal() +

  theme(legend.position = "none")

# 7. 基因表达热图(top基因)

if (length(top_variable_genes) > 0) {

  # 计算每个聚类的平均表达

  Idents(seurat_obj) <- seurat_obj$seurat_clusters

  cluster_avg <- AverageExpression(seurat_obj, features = top_variable_genes, group.by = "seurat_clusters")

  cluster_avg_matrix <- as.matrix(cluster_avg$RNA)


  # 创建热图

  spatial_plots[["heatmap"]] <- pheatmap(

    cluster_avg_matrix,

    scale = "row",

    cluster_rows = TRUE,

    cluster_cols = TRUE,

    color = colorRampPalette(c("blue", "white", "red"))(100),

    main = paste("Top", top_genes, "基因表达热图"),

    fontsize = 8,

    silent = TRUE

  )

}

# 8. 特征基因表达图(如果有空间信息)

# 由于输入是h5文件,可能没有空间坐标信息,这里创建一个模拟的空间图

if (!"spatial" %in% names(seurat_obj@images)) {

  cat("注意:没有检测到空间坐标信息,将创建模拟空间图...\n")


  # 基于UMAP坐标创建模拟空间图

  umap_coords <- Embeddings(seurat_obj, "umap")

  spatial_plots[["spatial_clusters"]] <- ggplot(data.frame(

    UMAP_1 = umap_coords[, 1],

    UMAP_2 = umap_coords[, 2],

    Cluster = Idents(seurat_obj)

  ), aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +

    geom_point(size = 0.5) +

    labs(title = "空间聚类分布(基于UMAP)", x = "UMAP_1", y = "UMAP_2") +

    theme_minimal() +

    theme(panel.grid = element_blank())

}

# 9. 特定基因的空间表达图

if (length(top_variable_genes) >= 4) {

  selected_genes <- top_variable_genes[1:4]


  for (i in 1:length(selected_genes)) {

    gene <- selected_genes[i]

    if (gene %in% rownames(seurat_obj)) {

      spatial_plots[[paste0("gene_", i)]] <- FeaturePlot(seurat_obj, features = gene, reduction = "umap") +

        labs(title = paste(gene, "基因表达")) +

        theme_minimal()

    }

  }

}

# 10. 质控指标在聚类中的分布

spatial_plots[["qc_violin"]] <- VlnPlot(seurat_obj,

                                       features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),

                                       pt.size = 0.1, ncol = 3) +

  labs(title = "质控指标在聚类中的分布") +

  theme_minimal()

# 保存所有图表

pdf(output_spatial_plots, width = 16, height = 12)

# 布局1:基本分析

print(wrap_plots(spatial_plots[["variable_features"]],

                 spatial_plots[["pca"]],

                 spatial_plots[["elbow"]],

                 spatial_plots[["cluster_counts"]],

                 ncol = 2))

# 布局2:聚类结果

print(wrap_plots(spatial_plots[["umap_clusters"]],

                 spatial_plots[["spatial_clusters"]],

                 ncol = 2))

# 布局3:基因表达热图

if (!is.null(spatial_plots[["heatmap"]])) {

  print(spatial_plots[["heatmap"]])

}

# 布局4:特定基因表达

if (length(top_variable_genes) >= 4) {

  gene_plots <- spatial_plots[grep("^gene_", names(spatial_plots))]

  if (length(gene_plots) > 0) {

    print(wrap_plots(gene_plots, ncol = 2))

  }

}

# 布局5:质控指标分布

print(spatial_plots[["qc_violin"]])

dev.off()

# 保存处理后的数据

saveRDS(seurat_obj, output_spatial_data)

# 输出分析摘要

cat("空间分布图谱分析完成!\n")

cat("聚类数量:", length(unique(Idents(seurat_obj))), "\n")

cat("高变基因数量:", length(VariableFeatures(seurat_obj)), "\n")

cat("分析结果已保存到:", output_spatial_plots, "\n")

cat("处理后数据已保存到:", output_spatial_data, "\n")

# 打印聚类统计信息

cat("\n聚类统计信息:\n")

cluster_stats <- table(Idents(seurat_obj))

for (i in 1:length(cluster_stats)) {

  cat("聚类", names(cluster_stats)[i], ":", cluster_stats[i], "个细胞\n")

}

©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容