Github官网里的介绍:
SPRING ( https://doi.org/10.1093/bioinformatics/btx792 ) 是预处理脚本的集合和基于 Web 浏览器的工具,用于可视化高维数据并与之交互。在此处查看示例数据集。SPRING 是为单细胞 RNA-Seq 数据开发的,但可以更广泛地应用。最小输入是高维数据点(细胞)矩阵和维名称(基因)列表
高维数据的低维可视化通常是不完美的。SPRING 不是试图呈现单个细胞数据的单一“确定性”视图,而是允许探索多种可视化,以开发数据结构的直觉。SPRING 的核心是创建数据点的 k 最近邻 (kNN) 图,并使用力导向布局在 2D 中可视化该图。
环境配置
conda create python=2.7 -n SPRING
source /home/miniconda3/bin/activate SPRING
conda install scikit-learn numpy scipy matplotlib h5py
pip install networkx fa2 python-louvain
分析使用
参考数据集和脚本
下载一个案例看看:
wget https://github.com/AllonKleinLab/SPRING_dev/blob/master/data_prep/spring_example_pbmc4k.ipynb
主目录应包含以下文件:
matrix.mtx
counts_norm_sparse_cells.hdf5
counts_norm_sparse_genes.hdf5
genes.txt
以及每个 SPRING 图的一个子目录:
categorical_coloring_data.json
cell_filter.npy
cell_filter.txt
color_data_gene_sets.csv
color_stats.json
coordinates.txt
edges.csv
graph_data.json
run_info.json
安装jupyter notebook
conda install jupyter
下载SPRING的github包
git clone https://git@github.com/AllonKleinLab/SPRING.git
sudo chmod -R a+w SPRING
cd SPRING
测试是否完成部署
- 通过输入启动本地服务器
python -m SimpleHTTPServer 8000 &
- 然后在网络浏览器(最好是 Chrome)中转到 http://192.168.3.33:8000/springViewer.html?datasets/centroids
说明环境搭建基本完成
然后在R中,用自己Seurat对象的数据矩阵、便签进行预处理转换成输入文件
# 保存矩阵
library(data.table)
setwd('./SPRING/example_inputs/project/')
tt<-data.frame(seu@assays$RNA@counts)
tt[1:10, 1:10]
fwrite(tt, file = 'seu_mtx.xls', quote = F, sep = '\t', row.names = T, col.names = T)
# 保存Seurat分群、样本来源、细胞类型标签
write.table(data.frame(orig.ident=as.character(seu$orig.ident),
cluster=as.character(seu$seurat_clusters)),
file = 'orig.ident_list.xls', sep = '\t',
quote = F, col.names = T, row.names = F)
数据预处理
启动Jupyer Notebook
jupyter notebook --ip 192.168.3.33 --port 8889 &
打开浏览器输入并用Token登录:
192.168.3.33:8889
运行Python代码:
import pickle, numpy as np
# Import SPRING helper functions
from preprocessing_python import *
#矩阵导入
import numpy as np
import pandas as pd
mtx = pd.read_csv('./example_inputs/project/seu_mtx.xls',
delimiter='\t', encoding='utf-8')
#矩阵调整格式
mtx.index = mtx.iloc[:,:1]['Unnamed: 0']
mtx.index
mtx = mtx.iloc[:, 1:]
mtx.head()
gene_list = list(mtx.index)
gene_list[1:10]
#矩阵转置
mtx = mtx.transpose()
# 转换为numpy数组
mtx = mtx.to_numpy()
# 数据过滤、PCA降维, kNN聚类
# Filter out cells with fewer than 1000 UMIs
print 'Filtering cells'
mtx,cell_filter = filter_cells(mtx, 1000)
# Normalize gene expression data
# Only use genes that make up <
print 'Row-normalizing'
mtx = row_normalize(mtx)
# Filter genes with mean expression < 0.1 and fano factor < 3
print 'Filtering genes'
_,gene_filter = filter_genes(mtx,0.1,3)
# Z-score the gene-filtered expression matrix and do PCA with 20 pcs
print 'Zscoring and PCA'
Epca = get_PCA(Zscore(mtx[:,gene_filter]),20)
# get euclidean distances in the PC space
print 'Getting distance matrix'
D = get_distance_matrix(Epca)
# 导入标签信息
tt = pd.read_csv('./example_inputs/project/orig.ident_list.xls', delimiter='\t')
orig_ident = list(tt['orig.ident'])
cluster_list = list(tt['cluster'])
celltype=list(tt['celltype'])
# 整合成字典
cell_groupings = dict() #'cluster':cluster_list, 'timepoint':orig_ident
cell_groupings['cluster'] = cluster_list
cell_groupings['timepoint'] = orig_ident
cell_groupings['celltype'] = celltype
# 生成网页可视化的输入文件夹里面的文件
# load additional data (gene_list, cell_groupings, custom_colors)
# gene_list is a list of genes with length E.shape[1]
# cell_groupings is a dict of the form: { <grouping_name> : [<cell1_label>, <cell2_label>,...] }
# a "grouping" could be the sample id, cluster label, or any other categorical variable
# custom_colors is a dict of the form { <color_track_name> : [<cell1_value>, <cell2_value>,...] }
# a "custom color" is any continuous variable that you would like to use for coloring cels.
# gene_list, cell_groupings, custom_colors = pickle.load(open('example_inputs/python_data.p'))
# save a SPRING plots with k=5 edges per node in the directory "datasets/frog_python/"
# coarse graining can also be performed using the optional coarse_grain_X parameter
print 'Saving SPRING plot'
save_spring_dir(mtx,D,5,gene_list,'datasets/project',
cell_groupings=cell_groupings,
# custom_colors=custom_colors,
coarse_grain_X=1)
在浏览器中可视化数据并进行下载
打开浏览器,输入:http://192.168.3.33:8000/springViewer.html?datasets/project
后面就可以进行对单个基因表达量的可视化
以及把自己的标签映射上去
然后是下载保存图片
总结
- 个人认为这个SPRING算法效果不是特别好,尤其针对几个样本合并的情况
- 即使原始的输入数据不变,但是每一次运行出来的图像样子都是不一样的,没办法复现(按作者的说法是“Low-dimensional visualizations of high-dimensional data are usually imperfect. Rather than attempting to present a single ‘definitive’ view of single cell data, SPRING allows exploration of multiple visualizations in order to develop an intuition for data structure. ”)
- 更多功能和坑后面再摸索
==================================================================
后续dev开发板踩坑
环境搭建差不多,遇到没有的库pip安装即可
官网:https://github.com/AllonKleinLab/SPRING_dev
https://github.com/AllonKleinLab/SPRING_dev.git
#创建conda环境后启动环境
source path/bin/activate SPRING_dev
cd ./SPRING_dev
#启动Jupyter
jupyter notebook --ip 192.168.3.33 --port 8899 &
打开浏览器输入 192.168.3.33:8899
数据预处理
#import modules, define some functions for loading, saving and processing a gene-barcode matrix
%matplotlib inline
import pkg_resources
import collections
import numpy as np
import pandas as pd
import scipy.sparse as sp_sparse
import tables
import matplotlib
import matplotlib.pyplot as plt
import h5py
np.random.seed(0)
GeneBCMatrix = collections.namedtuple('GeneBCMatrix', ['gene_ids', 'gene_names', 'barcodes', 'matrix'])
def get_matrix_from_h5(filename, genome):
with tables.open_file(filename, 'r') as f:
try:
dsets = {}
for node in f.walk_nodes('/' + genome, 'Array'):
dsets[node.name] = node.read()
matrix = sp_sparse.csc_matrix((dsets['data'], dsets['indices'], dsets['indptr']), shape=dsets['shape'])
if 'id' in dsets.keys():
return GeneBCMatrix(dsets['id'], dsets['name'], dsets['barcodes'], matrix)
else:
return GeneBCMatrix(dsets['genes'], dsets['gene_names'], dsets['barcodes'], matrix)
except tables.NoSuchNodeError:
raise Exception("Genome %s does not exist in this file." % genome)
except KeyError:
raise Exception("File is missing one or more required datasets.")
def save_matrix_to_h5(gbm, filename, genome):
flt = tables.Filters(complevel=1)
with tables.open_file(filename, 'w', filters=flt) as f:
try:
group = f.create_group(f.root, genome)
f.create_carray(group, 'genes', obj=gbm.gene_ids)
f.create_carray(group, 'gene_names', obj=gbm.gene_names)
f.create_carray(group, 'barcodes', obj=gbm.barcodes)
f.create_carray(group, 'data', obj=gbm.matrix.data)
f.create_carray(group, 'indices', obj=gbm.matrix.indices)
f.create_carray(group, 'indptr', obj=gbm.matrix.indptr)
f.create_carray(group, 'shape', obj=gbm.matrix.shape)
except:
raise Exception("Failed to write H5 file.")
def subsample_matrix(gbm, barcode_indices):
return GeneBCMatrix(gbm.gene_ids, gbm.gene_names, gbm.barcodes[barcode_indices], gbm.matrix[:, barcode_indices])
def get_expression(gbm, gene_name):
gene_indices = np.where(gbm.gene_names == gene_name)[0]
if len(gene_indices) == 0:
raise Exception("%s was not found in list of gene names." % gene_name)
return gbm.matrix[gene_indices[0], :].toarray().squeeze()
def save_count_matrix_to_h5(E, gene_list, filename):
with h5py.File(filename, 'w') as hf:
hf.create_dataset("indptr" , data= E.indptr)
hf.create_dataset("indices", data= E.indices)
hf.create_dataset("data" , data= E.data)
hf.create_dataset("shape" , data= E.shape)
hf.create_dataset("genes" , data= gene_list)
%pylab inline
from helper_functions import *
from collections import defaultdict
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'DejaVu Sans'
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42
sample_list = ["sample1", "sample2"]
# D stores all the data; one entry per library
D = {}
for j, s in enumerate(sample_list):
print(j, s)
# D stores all the data; one entry per library
D = {}
for j, s in enumerate(sample_list):
filename = './Project/'+ s +'/outs/raw_feature_bc_matrix.h5'
raw_matrix_h5 = filename
print raw_matrix_h5
if filename == './Project/'+ s +'/outs/raw_feature_bc_matrix.h5':
genome = "matrix"
else:
genome = 'GRCh38'
D[s] = {}
D[s]['meta'] = {}
gbm = get_matrix_from_h5(raw_matrix_h5, genome)
D[s]['E'] = transpose(gbm.matrix)
D[s]['meta']['gene_list']=gbm.gene_names
D[s]['meta']['gene_id']=gbm.gene_ids
print D[s]['E'].shape
D[s]['cell_index']=gbm.barcodes
print (len(D[s]['cell_index']))
gene_list = D[s]['meta']['gene_list']
gene_id = D[s]['meta']['gene_id']
Filter cells by total counts and number of genes detected
# plot total counts histograms - don't actually filter out any barcodes yet
# adjust total counts thresholds
for j,s in enumerate(sample_list):
D[s]['total_counts'] = np.sum(D[s]['E'], axis=1).A[:,0]
D[s]['genes_detected'] = np.sum(D[s]['E']>0, axis=1).A[:,0]
D[s]['meta']['min_tot']= np.mean(D[s]['total_counts'])+np.std(D[s]['total_counts'])
D[s]['meta']['min_genes_detected']=200;
fig3 = plt.figure()
ax3 = fig3.add_subplot(111)
ax3.set_xlabel('Transcripts per barcode (log10)')
ax3.set_ylabel('genes detected')
ax3.scatter(log(1 + D[s]['total_counts']),D[s]['genes_detected']);
ax3.plot(ax3.get_xlim(),[D[s]['meta']['min_genes_detected'],D[s]['meta']['min_genes_detected']]);
title(s)
D_orig=D
for j,s in enumerate(sample_list):
ix = (D[s]['genes_detected'] > 200) & (D[s]['total_counts'] > 500)
if np.sum(ix) > 0:
print s, np.sum(ix), '/', D[s]['E'].shape[0]
# Actually filter out low-count barcodes
for j,s in enumerate(sample_list):
print '--- %s ---' %s
print 'Pre-filter: %i barcodes' %D[s]['E'].shape[0]
tmpfilt = np.nonzero((D[s]['genes_detected'] > 200) & (D[s]['total_counts'] > 500))[0]
D[s] = filter_dict(D[s], tmpfilt)
print 'Post-filter: %i barcodes' %D[s]['E'].shape[0]
Filter cells by mito fraction
# get mitochondrial genes
mt_ix = [i for i,g in enumerate(gene_list) if g.startswith('MT-')]
print [gene_list[i] for i in mt_ix]
# plot mito-gene frac histograms - don't actually filter out any cells yet
# set mito-gene frac threshold
for j,s in enumerate(sample_list):
D[s]['meta']['max_mt'] = 0.2
for j,s in enumerate(sample_list):
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xscale='linear', yscale='linear',
xlabel='MT frac.', ylabel='no. cells')
D[s]['mito_frac'] = np.sum(D[s]['E'][:,mt_ix], axis=1).A[:,0] / np.sum(D[s]['E'], axis=1,dtype=float).A[:,0]
ax.hist(D[s]['mito_frac'], cumulative=False,
bins=np.linspace(0, 1, 30))
ax.plot([D[s]['meta']['max_mt'],D[s]['meta']['max_mt']],ax.get_ylim());
title(s)
print D[s]['E'].shape[0], np.sum(D[s]['mito_frac'] <= D[s]['meta']['max_mt'])
# Actually filter out mito-high cells
D_filt=D
for s in sample_list:
print '--- %s ---' %s
print 'Pre-filter: %i barcodes' %D[s]['E'].shape[0]
tmpfilt = np.nonzero(D[s]['mito_frac'] <= D[s]['meta']['max_mt'])[0]
D[s] = filter_dict(D[s], tmpfilt)
print 'Post-filter: %i barcodes' %D[s]['E'].shape[0]
Merge data, normalize
# create master dataset (all SPRING subsets will refer back to this)
samp_lookup = {}
samp_id_flat = np.array([],dtype=str)
for s in D.keys():
samp_id_flat = np.append(samp_id_flat, [s] * D[s]['E'].shape[0])
E = scipy.sparse.lil_matrix((len(samp_id_flat), len(gene_list)), dtype=int)
total_counts = np.zeros(len(samp_id_flat), dtype=int)
mito_frac = np.zeros(len(samp_id_flat), dtype=float)
genes_detected = np.zeros(len(samp_id_flat), dtype=int)
for s in D.keys():
print s
E[samp_id_flat == s, :] = D[s]['E']
total_counts[samp_id_flat == s] = D[s]['total_counts']
mito_frac[samp_id_flat == s] = D[s]['mito_frac']
genes_detected[samp_id_flat == s] = D[s]['genes_detected']
E = E.tocsc()
E_full=E
# remove genes that are not expressed by any cells
# optionally remove mito and rps genes
gene_list = D[sample_list[0]]['meta']['gene_list']
gene_id = D[sample_list[0]]['meta']['gene_id']
remove_crapgenes = 1
if remove_crapgenes:
import re
mt_ix = [i for i,g in enumerate(gene_list) if g.startswith('MT-')]
rp_ix = [i for i,g in enumerate(gene_list) if g.startswith('RP1')
or g.startswith('RP2') or g.startswith('RP3')
or g.startswith('RP4') or g.startswith('RP5')
or g.startswith('RP6') or g.startswith('RP7')
or g.startswith('RP8') or g.startswith('RP9')
or g.startswith('RPL') or g.startswith('RPS')
]
#or g.startswith('RP4','RP5','RP6') or g.startswith('RP7','RP8','RP9')]
keep_genes = (E.sum(0) > 0).A.squeeze() #* rp_ix * mt_ix
keep_genes[rp_ix] = 0
keep_genes[mt_ix] = 0
print sum(keep_genes), '/', len(keep_genes)
else:
keep_genes = (E.sum(0) > 0).A.squeeze()
print sum(keep_genes), '/', len(keep_genes)
# normalize by total counts
E = E[:,keep_genes]
E = tot_counts_norm_sparse(E)[0]
print shape(E)
gene_list = gene_list[keep_genes]
gene_id = gene_id[keep_genes]
merged_list=[x+"_"+gene_id[i] for i,x in enumerate(gene_list)]
Save base directory files
main_spring_dir=os.getcwd()
if not os.path.exists(main_spring_dir):
os.makedirs(main_spring_dir)
# Option to save the barcode information for later use
BCR = 1
if BCR:
barcode_dir = main_spring_dir + "/barcodes/"
if not os.path.exists(barcode_dir):
os.makedirs(barcode_dir)
for s in sample_list:
print '--- %s ---' %s
np.savetxt(barcode_dir + s, D[s]['cell_index'], fmt='%s')
# option to save mitochondrial percentages for later use
MT = 1
if MT:
MT_dir = main_spring_dir + "/mito/"
if not os.path.exists(MT_dir):
os.makedirs(MT_dir)
for s in sample_list:
print '--- %s ---' %s
np.savetxt(MT_dir + s, D[s]['mito_frac'], fmt='%s')
gene_list_new=np.array(merged_list)
np.savetxt(main_spring_dir + '/genes.txt', merged_list, fmt='%s')
# save master expression matrix in hdf5 format
import h5py
print 'Saving hdf5 file for fast gene loading...'
E = E.tocsc()
hf = h5py.File(main_spring_dir + '/counts_norm_sparse_genes.hdf5', 'w')
counts_group = hf.create_group('counts')
cix_group = hf.create_group('cell_ix')
hf.attrs['ncells'] = E.shape[0]
hf.attrs['ngenes'] = E.shape[1]
for iG, g in enumerate(merged_list):
if iG % 3000 == 0:
print g, iG, '/', len(merged_list)
counts = E[:,iG].A.squeeze()
cell_ix = np.nonzero(counts)[0]
counts = counts[cell_ix]
counts_group.create_dataset(g, data = counts)
cix_group.create_dataset(g, data = cell_ix)
hf.close()
##############
print 'Saving hdf5 file for fast cell loading...'
E = E.tocsr()
hf = h5py.File(main_spring_dir + '/counts_norm_sparse_cells.hdf5', 'w')
counts_group = hf.create_group('counts')
gix_group = hf.create_group('gene_ix')
hf.attrs['ncells'] = E.shape[0]
hf.attrs['ngenes'] = E.shape[1]
for iC in range(E.shape[0]):
if iC % 3000 == 0:
print iC, '/', E.shape[0]
counts = E[iC,:].A.squeeze()
gene_ix = np.nonzero(counts)[0]
counts = counts[gene_ix]
counts_group.create_dataset(str(iC), data = counts)
gix_group.create_dataset(str(iC), data = gene_ix)
hf.close()
写入矩阵
scipy.io.mmwrite(main_spring_dir + '/matrix.mtx', E)
Save merged samples
merge_setup = {'/Dataset_v1':sample_list}
for s, smerge in merge_setup.items():
print smerge
t0 = time.time()
print t0
print '________________', s
cell_ix = np.in1d(samp_id_flat, smerge)
run_all_spring_1_6(E[cell_ix,:],
list(merged_list), s, main_spring_dir, normalize=False, tot_counts_final = total_counts[cell_ix],
min_counts = 3, min_cells = 3, min_vscore_pctl = 90,
show_vscore_plot = True, num_pc = 60, pca_method = '', k_neigh=4, use_approxnn = False,
output_spring = True, num_force_iter = 2000,
cell_groupings = {'Sample': list(samp_id_flat[cell_ix])})
print time.time() - t0
np.save(main_spring_dir + s + '/cell_filter.npy', np.nonzero(cell_ix)[0])
np.savetxt(main_spring_dir + s + '/cell_filter.txt', np.nonzero(cell_ix)[0], fmt='%i')
print time.time() - t0
SignacX包进行分群和标签生成
#install.packages('SignacX')
library(SignacX)
# dir is the subdirectory generated by the Jupyter notebook; it is the directory that contains the 'categorical_coloring_data.json' file.
dir = "./Dataset_v1/"
# load the expression data
E = CID.LoadData(dir)
# generate cellular phenotype labels
labels = Signac(E, spring.dir = dir, num.cores = 14)
# alternatively, if you're in a hurry, use:
# labels = SignacFast(E, spring.dir = dir, num.cores = 4)
celltypes = GenerateLabels(labels, E = E, spring.dir = dir)
# write cell types and Louvain clusters to SPRING
dat <- CID.writeJSON(celltypes, spring.dir = dir)
添加自己的标签(略)
网页可视化
python -m SimpleHTTPServer 8080 &