作者,Evil Genius
这一篇我们继续合作项目的分析示例,我们在了解了空间转录组的整体分析内容之后,就需要开始针对我们自己的数据进行有效分析,注意是有效分析,就是大家分析出来的结果要有生物学意义,不是随便跑个结果就结束的这种。
首先拿到数据,现在通常都是多个空间数据样本,当然了,低精度还需要单细胞样本,即使是HD,也是推荐用单细胞来解卷积的。
那么我们首先第一步需要考虑的问题,就是空间数据质控的问题,上过课的都应该明白,空间数据质控不能像单细胞质控那样处理,一般只需要结合图像信息即可,其中唯一需要考虑的问题,就是空间污染的问题。
数据矫正之后,我们就需要考虑第二个问题,空间数据要不要整合的问题,整合还是不整合都有文章运用,不过通常来讲整合占大多数,这也是公司层面默认的选择,如果选择不整合,就需要根据形态学区域划分,这个难度更高一点。
我们以其中4个样本为例,实现基础的整合分析。
我们如果选择了整合,那么方法有两种CCA、Harmony,哪种更好呢?没有定论,从结合空间图像的效果来看,harmony大多数情况是符合形态学特征的,当然了,分析的时候两种结果都要拿到做比较分析。
分析的时候要注意,软件版本之间可能不兼容。
library(tidyverse)
library(Seurat)
library(cowplot)
library(RColorBrewer)
library(pheatmap)
library(scales)
library(ggsci)
library(harmony)
source('/home/scRNA/DB/SPATIAL_FUNCTIONS.R')
# Define Color Scheme
custom_colors <- c("#B4DFFC","#6EAB3D","#FFD700","#A020F0","#FFA500","#AEDD3C","#595959","#D2AF81FF","#3A49FC","#FF0000","#A86F3D","#A18FAA")
#读取数据
MGH258 = Load10X_Spatial('/home/jinsai/pipeline/drug_pipeline/DB/Inputs/general/GBM_data/MGH258/outs')
UKF243 = Load10X_Spatial('/home/jinsai/pipeline/drug_pipeline/DB/Inputs/general/GBM_data/UKF243/outs')
UKF248 = Load10X_Spatial('/home/jinsai/pipeline/drug_pipeline/DB/Inputs/general/GBM_data/UKF248/outs')
UKF251 = Load10X_Spatial('/home/jinsai/pipeline/drug_pipeline/DB/Inputs/general/GBM_data/UKF251/outs')
samples = c(MGH258,UKF243,UKF248,UKF251)
####Merging data for Harmony Batch correction(CCA)
combined_CCA <- st_combine(samples, ndim = 20, res = 0.5)
####Harmony整合
combined_harmony <- SCTransform(combined_CCA,assay = "Spatial", vars.to.regress = 'sample')
combined_harmony <- RunPCA(combined_harmony,assay = "SCT")
combined_harmony<- RunHarmony(combined_harmony,assay.use = "SCT",project.dim = FALSE,group.by.vars = 'sample')
这个时候就会拿到基础的整合结果。
下面就是基础的降维聚类差异富集了。
##CCA
combined_CCA <- combined_CCA %>%
RunUMAP(reduction = "CCA", dims = 1:20) %>%
FindNeighbors(reduction = "CCA", dims = 1:20) %>%
FindClusters(resolution = 0.4) %>%
identity()
####Harmony
combined_harmony <- combined_harmony %>%
RunUMAP(reduction = "harmony", dims = 1:20) %>%
FindNeighbors(reduction = "harmony", dims = 1:20) %>%
FindClusters(resolution = 0.4) %>%
identity()
差异富集大家自己做,然后我们空间展示
DimPlot(combined_harmony, cols=color.labels, pt.size=3.5)
结合空间信息进行调整
SpatialDimPlot(combined_harmony,cols=color.labels,pt.size.factor = 2.5)
基础分析是一切的基础,通常在基础分析之上,会加上NMF寻找programs
library (nmf)
m <- as.matrix(combined_harmony)####注意这是抽取矩阵
res <- NMF::nmf(x = m, rank = rank_lb:rank_ub, nrun = 5, method = "snmf/r", .opt = list(debug=F, parallel=F, shared.memory=F, verbose=T))
#robust_nmf_programs function
robust_nmf_programs <- function(nmf_programs, intra_min = 30, intra_max = 10, inter_filter=T, inter_min = 10) {
# Select NMF programs based on the minimum overlap with other NMF programs from the same sample
intra_intersect <- lapply(nmf_programs, function(z) apply(z, 2, function(x) apply(z, 2, function(y) length(intersect(x,y)))))
intra_intersect_max <- lapply(intra_intersect, function(x) apply(x, 2, function(y) sort(y, decreasing = T)[2]))
nmf_sel <- lapply(names(nmf_programs), function(x) nmf_programs[[x]][,intra_intersect_max[[x]]>=intra_min])
names(nmf_sel) <- names(nmf_programs)
# Select NMF programs based on i) the maximum overlap with other NMF programs from the same sample and
# ii) the minimum overlap with programs from another sample
nmf_sel_unlist <- do.call(cbind, nmf_sel)
inter_intersect <- apply(nmf_sel_unlist , 2, function(x) apply(nmf_sel_unlist , 2, function(y) length(intersect(x,y)))) ## calculating intersection between all programs
final_filter <- NULL
for(i in names(nmf_sel)) {
a <- inter_intersect[grep(i, colnames(inter_intersect), invert = T),grep(i, colnames(inter_intersect))]
b <- sort(apply(a, 2, max), decreasing = T) # for each sample, ranks programs based on their maximum overlap with programs of other samples
if(inter_filter==T) b <- b[b>=inter_min] # selects programs with a maximum intersection of at least 10
if(length(b) > 1) {
c <- names(b[1])
for(y in 2:length(b)) {
if(max(inter_intersect[c,names(b[y])]) <= intra_max) c <- c(c,names(b[y])) # selects programs iteratively from top-down. Only selects programs that have a intersection smaller than 10 with a previously selected programs
}
final_filter <- c(final_filter, c)
} else {
final_filter <- c(final_filter, names(b))
}
}
return(final_filter)
}
## Create list of NMF matrics where each sample is an entry
prog_genes_ls <- list()
for(i in seq_along(sample_ls)) {
nmf_obj <- readRDS(paste(path, sample_ls[[i]], sep = "/"))
samp_name <- stringr::str_split(sample_ls[[i]], pattern = "_")[[1]][[1]]
nmf_mats <- c()
for(j in names(nmf_obj$fit)) {
get_basis <- basis(nmf_obj$fit[[j]])
colnames(get_basis) <- paste0(samp_name, ".", j, ".", 1:j)
nmf_mats <- cbind(nmf_mats, get_basis)
}
prog_genes_ls[[i]] <- nmf_mats
names(prog_genes_ls)[i] <- paste0(samp_name)
rm(nmf_obj, nmf_mats, get_basis)
}
Genes_nmf_w_basis <- do.call(c, list(prog_genes_ls))
# Find robust NMFs
# get gene programs (top 50 genes by NMF score)
nmf_programs_sig <- lapply(prog_genes_ls, function(x) apply(x, 2, function(y) names(sort(y, decreasing = T))[1:50]))
# for each sample, select robust NMF programs (i.e. observed using different ranks in the same sample), remove redundancy due to multiple ranks, and apply a filter based on the similarity to programs from other samples.
nmf_filter_all <- robust_nmf_programs(nmf_programs_sig, intra_min = 35, intra_max = 10, inter_filter=T, inter_min = 10)
nmf_programs_sig <- lapply(nmf_programs_sig, function(x) x[, is.element(colnames(x), nmf_filter_all),drop=F])
nmf_programs_sig <- do.call(cbind, nmf_programs_sig)
#leiden clusters
all_leiden_programs <- do.call(cbind, all_leiden_programs)
nmf_programs_sig<-cbind(nmf_programs_sig,all_leiden_programs)
# calculate similarity between programs
nmf_intersect <- apply(nmf_programs_sig , 2, function(x) apply(nmf_programs_sig , 2, function(y) length(intersect(x,y))))
# hierarchical clustering of the similarity matrix
nmf_intersect_hc_all <- hclust(as.dist(50-nmf_intersect), method="average")
nmf_intersect_hc_all <- reorder(as.dendrogram(nmf_intersect_hc_all), colMeans(nmf_intersect))
nmf_intersect <- nmf_intersect[order.dendrogram(nmf_intersect_hc_all), order.dendrogram(nmf_intersect_hc_all)]
nmf_intersect<-readRDS("MP/NMF/nmf_intersect_124.RDS")
nmf_programs_sig<-readRDS("MP/NMF/nmf_programs_sig_124.RDS")
### use a clustering approach that updates MPs in each iteration
nmf_intersect_KEEP <- nmf_intersect
nmf_programs_sig_KEEP <- nmf_programs_sig
### Parameters (later change to function form)v1-keep!
Min_intersect_initial <- 12 # the minimal intersection cutoff for defining the Founder NMF program of a cluster
Min_intersect_cluster <- 12 # the minimal intersection cuttof for adding a new NMF to the forming cluster
Min_group_size <- 4 # the minimal group size to consider for defining the Founder_NMF of a MP
Sorted_intersection <- sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1) ) , decreasing = TRUE)
Cluster_list <- list() ### Every entry contains the NMFs of a chosec cluster
k <- 1
Curr_cluster <- c()
MP_list <- list()
while (Sorted_intersection[1]>Min_group_size) {
Curr_cluster <- c(Curr_cluster , names(Sorted_intersection[1]))
### intersection between all remaining NMFs and Genes in MP
Genes_MP <- nmf_programs_sig[,names(Sorted_intersection[1])] # initial genes are those in the first NMF. Genes_MP always has only 50 genes consisting of the current MP
nmf_programs_sig <- nmf_programs_sig[,-match(names(Sorted_intersection[1]) , colnames(nmf_programs_sig))] # remove selected NMF
Intersection_with_Genes_MP <- sort(apply(nmf_programs_sig, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP
NMF_history <- Genes_MP # has all genes in all NMFs in the current cluster, for newly defining Genes_MP after adding a new NMF
### Create gene list - composed of intersecting genes in descending order. Update Curr_cluster each time.
while ( Intersection_with_Genes_MP[1] >= Min_intersect_cluster) { ### Define current cluster
Curr_cluster <- c(Curr_cluster , names(Intersection_with_Genes_MP)[1])
Genes_MP_temp <- sort(table(c(NMF_history , nmf_programs_sig[,names(Intersection_with_Genes_MP)[1]])), decreasing = TRUE) ## Genes_MP is newly defined each time according to all NMFs in the current cluster
Genes_at_border <- Genes_MP_temp[which(Genes_MP_temp == Genes_MP_temp[50])] ### genes with overlap equal to the 50th gene
if (length(Genes_at_border)>1){
### Sort last genes in Genes_at_border according to maximal NMF gene scores
### Run over all NMF programs in Curr_cluster and extract NMF scores for each gene
Genes_curr_NMF_score <- c()
for (i in Curr_cluster) {
curr_study <- strsplit(i, "[.]")[[1]][[1]]
Q <- Genes_nmf_w_basis[[curr_study]][ match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]])))[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))] ,i]
#names(Q) <- names(Genes_at_border[!is.na(match(names(Genes_at_border),toupper(rownames(Genes_nmf_w_basis[[curr_study]]))))]) ### sometimes when adding genes the names do not appear
Genes_curr_NMF_score <- c(Genes_curr_NMF_score, Q )
}
Genes_curr_NMF_score_sort <- sort(Genes_curr_NMF_score , decreasing = TRUE)
Genes_curr_NMF_score_sort <- Genes_curr_NMF_score_sort[unique(names(Genes_curr_NMF_score_sort))] ## take only the maximal score of each gene - which is the first entry after sorting
Genes_MP_temp <- c(names(Genes_MP_temp[which(Genes_MP_temp > Genes_MP_temp[50])]) , names(Genes_curr_NMF_score_sort))
} else {
Genes_MP_temp <- names(Genes_MP_temp)[1:50]
}
NMF_history <- c(NMF_history , nmf_programs_sig[,names(Intersection_with_Genes_MP)[1]])
Genes_MP <- Genes_MP_temp[1:50]
nmf_programs_sig <- nmf_programs_sig[,-match(names(Intersection_with_Genes_MP)[1] , colnames(nmf_programs_sig))] # remove selected NMF
Intersection_with_Genes_MP <- sort(apply(nmf_programs_sig, 2, function(x) length(intersect(Genes_MP,x))) , decreasing = TRUE) # intersection between all other NMFs and Genes_MP
}
Cluster_list[[paste0("Cluster_",k)]] <- Curr_cluster
MP_list[[paste0("MP_",k)]] <- Genes_MP
k <- k+1
nmf_intersect <- nmf_intersect[-match(Curr_cluster,rownames(nmf_intersect) ) , -match(Curr_cluster,colnames(nmf_intersect) ) ] # remove current chosen cluster
Sorted_intersection <- sort(apply(nmf_intersect , 2, function(x) (length(which(x>=Min_intersect_initial))-1) ) , decreasing = TRUE) ### Sort intersection of remaining NMFs not included in any of the previous clusters
Curr_cluster <- c()
print(dim(nmf_intersect)[2])
}
#### ***** Sort Jaccard similarity plot according to new clusters:
inds_sorted <- c()
for (j in 1:length(Cluster_list)){
inds_sorted <- c(inds_sorted , match(Cluster_list[[j]] , colnames(nmf_intersect_KEEP)))
}
inds_new <- c(inds_sorted , which(is.na( match(1:dim(nmf_intersect_KEEP)[2],inds_sorted)))) ### combine inds in clusters with inds without clusters
# plot re-ordered similarity matrix heatmap
nmf_intersect_meltI_NEW <- reshape2::melt(nmf_intersect_KEEP[inds_new,inds_new])
custom_magma <- c(colorRampPalette(c("white", rev(magma(323, begin = 0.15))[1]))(10), rev(magma(323, begin = 0.18)))
ggplot(data = nmf_intersect_meltI_NEW, aes(x=Var1, y=Var2, fill=100*value/(100-value), color=100*value/(100-value))) +
geom_tile() +
scale_color_gradient2(limits=c(2,25), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") +
scale_fill_gradient2(limits=c(2,25), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 13.5, oob=squish, name="Similarity\n(Jaccard index)") +
theme( axis.ticks = element_blank(), panel.border = element_rect(fill=F), panel.background = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 11), axis.title = element_text(size = 12), legend.title = element_text(size=11), legend.text = element_text(size = 10), legend.text.align = 0.5, legend.justification = "bottom") +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
guides(fill = guide_colourbar(barheight = 4, barwidth = 1))
ggplot(data = nmf_intersect_meltI_NEW, aes(x=Var1, y=Var2, fill=100*value/(100-value), color=100*value/(100-value))) +
geom_tile() +
scale_color_gradient2(limits=c(2,30), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 16, oob=squish, name="Similarity\n(Jaccard index)") +
scale_fill_gradient2(limits=c(2,30), low=custom_magma[1:111], mid =custom_magma[112:222], high = custom_magma[223:333], midpoint = 16, oob=squish, name="Similarity\n(Jaccard index)") +
theme( axis.ticks = element_blank(), panel.border = element_rect(fill=F), panel.background = element_blank(), axis.line = element_blank(), axis.text = element_text(size = 11), axis.title = element_text(size = 12), legend.title = element_text(size=11), legend.text = element_text(size = 10), legend.text.align = 0.5, legend.justification = "bottom") +
theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
guides(fill = guide_colourbar(barheight = 4, barwidth = 1))
下一步我们就需要考虑空间注释的问题
生活很好,有你更好