02.SP

#!/usr/bin/env Rscript

# 空间基因表达分布图谱分析

# 输入：质控后数据

# 输出：空间分布图谱、处理后数据

# 加载必要的包

suppressMessages({

library(Seurat)

library(ggplot2)

library(dplyr)

library(patchwork)

library(viridis)

library(cowplot)

library(RColorBrewer)

library(pheatmap)

})

# 读取Snakemake参数

input_qc_data <- snakemake@input[["qc_data"]]

top_genes <- snakemake@params[["top_genes"]]

resolution <- snakemake@params[["resolution"]]

# 输出文件路径

output_spatial_plots <- snakemake@output[["spatial_plots"]]

output_spatial_data <- snakemake@output[["spatial_data"]]

# 创建输出目录

dir.create(dirname(output_spatial_plots), recursive = TRUE, showWarnings = FALSE)

# 读取质控数据

cat("正在读取质控数据...\n")

seurat_obj <- readRDS(input_qc_data)

# 数据标准化

cat("正在进行数据标准化...\n")

seurat_obj <- NormalizeData(seurat_obj, normalization.method = "LogNormalize", scale.factor = 10000)

# 寻找高变基因

cat("正在寻找高变基因...\n")

seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "vst", nfeatures = 2000)

# 获取高变基因

top_variable_genes <- head(VariableFeatures(seurat_obj), top_genes)

# 数据缩放

cat("正在进行数据缩放...\n")

seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))

# 主成分分析

cat("正在进行主成分分析...\n")

seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj), verbose = FALSE)

# 聚类分析

cat("正在进行聚类分析...\n")

seurat_obj <- FindNeighbors(seurat_obj, dims = 1:10)

seurat_obj <- FindClusters(seurat_obj, resolution = resolution)

# UMAP降维

cat("正在进行UMAP降维...\n")

seurat_obj <- RunUMAP(seurat_obj, dims = 1:10)

# 创建图表列表

spatial_plots <- list()

# 1. 高变基因图

spatial_plots[["variable_features"]] <- VariableFeaturePlot(seurat_obj) +

labs(title = "高变基因识别") +

theme_minimal()

# 2. PCA图

spatial_plots[["pca"]] <- DimPlot(seurat_obj, reduction = "pca", dims = c(1, 2)) +

labs(title = "主成分分析") +

theme_minimal()

# 3. PCA贡献图

spatial_plots[["pca_loadings"]] <- VizDimLoadings(seurat_obj, dims = 1:2, reduction = "pca") +

labs(title = "主成分贡献") +

theme_minimal()

# 4. 肘部图

spatial_plots[["elbow"]] <- ElbowPlot(seurat_obj) +

labs(title = "主成分重要性") +

theme_minimal()

# 5. UMAP聚类图

spatial_plots[["umap_clusters"]] <- DimPlot(seurat_obj, reduction = "umap", label = TRUE) +

labs(title = "UMAP聚类结果") +

theme_minimal()

# 6. 聚类统计图

cluster_counts <- table(Idents(seurat_obj))

cluster_df <- data.frame(

Cluster = names(cluster_counts),

Count = as.numeric(cluster_counts)

)

spatial_plots[["cluster_counts"]] <- ggplot(cluster_df, aes(x = Cluster, y = Count, fill = Cluster)) +

geom_bar(stat = "identity") +

labs(title = "聚类细胞数量分布", x = "聚类", y = "细胞数量") +

theme_minimal() +

theme(legend.position = "none")

# 7. 基因表达热图（top基因）

if (length(top_variable_genes) > 0) {

# 计算每个聚类的平均表达

Idents(seurat_obj) <- seurat_obj$seurat_clusters

cluster_avg <- AverageExpression(seurat_obj, features = top_variable_genes, group.by = "seurat_clusters")

cluster_avg_matrix <- as.matrix(cluster_avg$RNA)

# 创建热图

spatial_plots[["heatmap"]] <- pheatmap(

cluster_avg_matrix,

scale = "row",

cluster_rows = TRUE,

cluster_cols = TRUE,

color = colorRampPalette(c("blue", "white", "red"))(100),

main = paste("Top", top_genes, "基因表达热图"),

fontsize = 8,

silent = TRUE

)

}

# 8. 特征基因表达图（如果有空间信息）

# 由于输入是h5文件，可能没有空间坐标信息，这里创建一个模拟的空间图

if (!"spatial" %in% names(seurat_obj@images)) {

cat("注意：没有检测到空间坐标信息，将创建模拟空间图...\n")

# 基于UMAP坐标创建模拟空间图

umap_coords <- Embeddings(seurat_obj, "umap")

spatial_plots[["spatial_clusters"]] <- ggplot(data.frame(

UMAP_1 = umap_coords[, 1],

UMAP_2 = umap_coords[, 2],

Cluster = Idents(seurat_obj)

), aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +

geom_point(size = 0.5) +

labs(title = "空间聚类分布（基于UMAP）", x = "UMAP_1", y = "UMAP_2") +

theme_minimal() +

theme(panel.grid = element_blank())

}

# 9. 特定基因的空间表达图

if (length(top_variable_genes) >= 4) {

selected_genes <- top_variable_genes[1:4]

for (i in 1:length(selected_genes)) {

gene <- selected_genes[i]

if (gene %in% rownames(seurat_obj)) {

spatial_plots[[paste0("gene_", i)]] <- FeaturePlot(seurat_obj, features = gene, reduction = "umap") +

labs(title = paste(gene, "基因表达")) +

theme_minimal()

}

# 10. 质控指标在聚类中的分布

spatial_plots[["qc_violin"]] <- VlnPlot(seurat_obj,

features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),

pt.size = 0.1, ncol = 3) +

labs(title = "质控指标在聚类中的分布") +

theme_minimal()

# 保存所有图表

pdf(output_spatial_plots, width = 16, height = 12)

# 布局1：基本分析

print(wrap_plots(spatial_plots[["variable_features"]],

spatial_plots[["pca"]],

spatial_plots[["elbow"]],

spatial_plots[["cluster_counts"]],

ncol = 2))

# 布局2：聚类结果

print(wrap_plots(spatial_plots[["umap_clusters"]],

spatial_plots[["spatial_clusters"]],

ncol = 2))

# 布局3：基因表达热图

if (!is.null(spatial_plots[["heatmap"]])) {

print(spatial_plots[["heatmap"]])

}

# 布局4：特定基因表达

if (length(top_variable_genes) >= 4) {

gene_plots <- spatial_plots[grep("^gene_", names(spatial_plots))]

if (length(gene_plots) > 0) {

print(wrap_plots(gene_plots, ncol = 2))

}

# 布局5：质控指标分布

print(spatial_plots[["qc_violin"]])

dev.off()

# 保存处理后的数据

saveRDS(seurat_obj, output_spatial_data)

# 输出分析摘要

cat("空间分布图谱分析完成！\n")

cat("聚类数量:", length(unique(Idents(seurat_obj))), "\n")

cat("高变基因数量:", length(VariableFeatures(seurat_obj)), "\n")

cat("分析结果已保存到:", output_spatial_plots, "\n")

cat("处理后数据已保存到:", output_spatial_data, "\n")

# 打印聚类统计信息

cat("\n聚类统计信息:\n")

cluster_stats <- table(Idents(seurat_obj))

for (i in 1:length(cluster_stats)) {

cat("聚类", names(cluster_stats)[i], ":", cluster_stats[i], "个细胞\n")

}

02.SP

02.SP

相关阅读更多精彩内容

友情链接更多精彩内容