#!/usr/bin/env Rscript
# 空间基因表达分布图谱分析
# 输入:质控后数据
# 输出:空间分布图谱、处理后数据
# 加载必要的包
suppressMessages({
library(Seurat)
library(ggplot2)
library(dplyr)
library(patchwork)
library(viridis)
library(cowplot)
library(RColorBrewer)
library(pheatmap)
})
# 读取Snakemake参数
input_qc_data <- snakemake@input[["qc_data"]]
top_genes <- snakemake@params[["top_genes"]]
resolution <- snakemake@params[["resolution"]]
# 输出文件路径
output_spatial_plots <- snakemake@output[["spatial_plots"]]
output_spatial_data <- snakemake@output[["spatial_data"]]
# 创建输出目录
dir.create(dirname(output_spatial_plots), recursive = TRUE, showWarnings = FALSE)
# 读取质控数据
cat("正在读取质控数据...\n")
seurat_obj <- readRDS(input_qc_data)
# 数据标准化
cat("正在进行数据标准化...\n")
seurat_obj <- NormalizeData(seurat_obj, normalization.method = "LogNormalize", scale.factor = 10000)
# 寻找高变基因
cat("正在寻找高变基因...\n")
seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "vst", nfeatures = 2000)
# 获取高变基因
top_variable_genes <- head(VariableFeatures(seurat_obj), top_genes)
# 数据缩放
cat("正在进行数据缩放...\n")
seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
# 主成分分析
cat("正在进行主成分分析...\n")
seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj), verbose = FALSE)
# 聚类分析
cat("正在进行聚类分析...\n")
seurat_obj <- FindNeighbors(seurat_obj, dims = 1:10)
seurat_obj <- FindClusters(seurat_obj, resolution = resolution)
# UMAP降维
cat("正在进行UMAP降维...\n")
seurat_obj <- RunUMAP(seurat_obj, dims = 1:10)
# 创建图表列表
spatial_plots <- list()
# 1. 高变基因图
spatial_plots[["variable_features"]] <- VariableFeaturePlot(seurat_obj) +
labs(title = "高变基因识别") +
theme_minimal()
# 2. PCA图
spatial_plots[["pca"]] <- DimPlot(seurat_obj, reduction = "pca", dims = c(1, 2)) +
labs(title = "主成分分析") +
theme_minimal()
# 3. PCA贡献图
spatial_plots[["pca_loadings"]] <- VizDimLoadings(seurat_obj, dims = 1:2, reduction = "pca") +
labs(title = "主成分贡献") +
theme_minimal()
# 4. 肘部图
spatial_plots[["elbow"]] <- ElbowPlot(seurat_obj) +
labs(title = "主成分重要性") +
theme_minimal()
# 5. UMAP聚类图
spatial_plots[["umap_clusters"]] <- DimPlot(seurat_obj, reduction = "umap", label = TRUE) +
labs(title = "UMAP聚类结果") +
theme_minimal()
# 6. 聚类统计图
cluster_counts <- table(Idents(seurat_obj))
cluster_df <- data.frame(
Cluster = names(cluster_counts),
Count = as.numeric(cluster_counts)
)
spatial_plots[["cluster_counts"]] <- ggplot(cluster_df, aes(x = Cluster, y = Count, fill = Cluster)) +
geom_bar(stat = "identity") +
labs(title = "聚类细胞数量分布", x = "聚类", y = "细胞数量") +
theme_minimal() +
theme(legend.position = "none")
# 7. 基因表达热图(top基因)
if (length(top_variable_genes) > 0) {
# 计算每个聚类的平均表达
Idents(seurat_obj) <- seurat_obj$seurat_clusters
cluster_avg <- AverageExpression(seurat_obj, features = top_variable_genes, group.by = "seurat_clusters")
cluster_avg_matrix <- as.matrix(cluster_avg$RNA)
# 创建热图
spatial_plots[["heatmap"]] <- pheatmap(
cluster_avg_matrix,
scale = "row",
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("blue", "white", "red"))(100),
main = paste("Top", top_genes, "基因表达热图"),
fontsize = 8,
silent = TRUE
)
}
# 8. 特征基因表达图(如果有空间信息)
# 由于输入是h5文件,可能没有空间坐标信息,这里创建一个模拟的空间图
if (!"spatial" %in% names(seurat_obj@images)) {
cat("注意:没有检测到空间坐标信息,将创建模拟空间图...\n")
# 基于UMAP坐标创建模拟空间图
umap_coords <- Embeddings(seurat_obj, "umap")
spatial_plots[["spatial_clusters"]] <- ggplot(data.frame(
UMAP_1 = umap_coords[, 1],
UMAP_2 = umap_coords[, 2],
Cluster = Idents(seurat_obj)
), aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
geom_point(size = 0.5) +
labs(title = "空间聚类分布(基于UMAP)", x = "UMAP_1", y = "UMAP_2") +
theme_minimal() +
theme(panel.grid = element_blank())
}
# 9. 特定基因的空间表达图
if (length(top_variable_genes) >= 4) {
selected_genes <- top_variable_genes[1:4]
for (i in 1:length(selected_genes)) {
gene <- selected_genes[i]
if (gene %in% rownames(seurat_obj)) {
spatial_plots[[paste0("gene_", i)]] <- FeaturePlot(seurat_obj, features = gene, reduction = "umap") +
labs(title = paste(gene, "基因表达")) +
theme_minimal()
}
}
}
# 10. 质控指标在聚类中的分布
spatial_plots[["qc_violin"]] <- VlnPlot(seurat_obj,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
pt.size = 0.1, ncol = 3) +
labs(title = "质控指标在聚类中的分布") +
theme_minimal()
# 保存所有图表
pdf(output_spatial_plots, width = 16, height = 12)
# 布局1:基本分析
print(wrap_plots(spatial_plots[["variable_features"]],
spatial_plots[["pca"]],
spatial_plots[["elbow"]],
spatial_plots[["cluster_counts"]],
ncol = 2))
# 布局2:聚类结果
print(wrap_plots(spatial_plots[["umap_clusters"]],
spatial_plots[["spatial_clusters"]],
ncol = 2))
# 布局3:基因表达热图
if (!is.null(spatial_plots[["heatmap"]])) {
print(spatial_plots[["heatmap"]])
}
# 布局4:特定基因表达
if (length(top_variable_genes) >= 4) {
gene_plots <- spatial_plots[grep("^gene_", names(spatial_plots))]
if (length(gene_plots) > 0) {
print(wrap_plots(gene_plots, ncol = 2))
}
}
# 布局5:质控指标分布
print(spatial_plots[["qc_violin"]])
dev.off()
# 保存处理后的数据
saveRDS(seurat_obj, output_spatial_data)
# 输出分析摘要
cat("空间分布图谱分析完成!\n")
cat("聚类数量:", length(unique(Idents(seurat_obj))), "\n")
cat("高变基因数量:", length(VariableFeatures(seurat_obj)), "\n")
cat("分析结果已保存到:", output_spatial_plots, "\n")
cat("处理后数据已保存到:", output_spatial_data, "\n")
# 打印聚类统计信息
cat("\n聚类统计信息:\n")
cluster_stats <- table(Idents(seurat_obj))
for (i in 1:length(cluster_stats)) {
cat("聚类", names(cluster_stats)[i], ":", cluster_stats[i], "个细胞\n")
}