python 将多个细胞的表达量合并成一个假细胞的表达量。
import anndata
import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc
import anndata as ad
import pandas as pd
import math
import re
def condense_cell(inadata=None,ingroup=None,insize=100,label='adata'):
meta=pd.DataFrame(inadata.obs)
meta=meta.sort_values(by=ingroup)
dfcount=meta[ingroup].value_counts().rename_axis('type').reset_index(name='counts')
dfcount.index=list(dfcount['type'])
lorder=list(set(meta[ingroup]))
lorder.sort()
dfcount=dfcount.loc[lorder,:]
lindex=[]
for i in dfcount['counts']:
tmp=list(range(0,i))
tmp=[math.ceil((i+1)/insize) for i in tmp]
lindex.extend(tmp)
meta['group']=meta[ingroup].astype('str')+'-'+['pseudocell_'+str(i) for i in lindex]
meta.to_csv(label+'_'+ingroup+'_'+str(insize)+'cells_condese_metadata.xls',sep='\t')
meta=meta.loc[list(inadata.obs.index),:]
inadata.obs['group']=meta['group']
grouped = inadata.to_df().groupby(inadata.obs['group']).mean()
new_adata = sc.AnnData(X=grouped.values, obs=pd.DataFrame(index=list(grouped.index)), var=inadata.var)
new_adata.obs['type']=[re.sub('-pseudocell_.*','',i) for i in list(new_adata.obs_names)]
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
new_adata.var["mt"] = new_adata.var_names.str.startswith("MT-")
# ribosomal genes
new_adata.var["ribo"] = new_adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
new_adata.var["hb"] = new_adata.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(
new_adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=False
)
new_adata.write(label+'_'+ingroup+'_'+str(insize)+'cells_condese.h5ad')
condense_cell(inadata=inadata,ingroup='group',insize=10,label='sc')
随机取样
def sample_adata(adata,ingroup='Celltype',pct=0.1,smin=50):
df=adata.obs
df['cell_barcode']=list(df.index)
df=df.groupby(ingroup).apply(lambda df: df.sample(frac=pct,random_state=2025,axis=0) if df.shape[0] > smin/pct else df.sample(n=smin,random_state=2025,axis=0)).reset_index(drop=True)
df.index=list(df['cell_barcode'])
adata=adata[list(df.index),:]
return adata
实例
adata1=sc.read('/data/mouse_ex.h5ad')
adata2=sample_adata(adata1,ingroup='Celltype',pct=0.1,smin=50)
合并基因:
def merge_gene(infile,gdf,specie=None,label='out',is_return=False):
ob1=sc.read(f1)
if sum(gdf['geneid'].isin(ob1.var_names)):
incol='geneid'
elif sum(gdf['gene_symbol'].isin(ob1.var_names)):
incol='gene_symbol'
else:
print('No genes match !')
gdf1=gdf.loc[gdf[incol].isin(ob1.var_names),:]
gdf1=gdf1.loc[gdf1['Specie'].str.contains(specie),:]
ob1=ob1[:,gdf1[incol]]
gdf1.index=list(gdf1[incol])
grouped=ob1.to_df().T.groupby(gdf1['Orthogroup']).sum().T
new_adata = sc.AnnData(X=sparse.csr_matrix(grouped.values),
obs=pd.DataFrame(index=list(grouped.index)),
var=list(grouped.columns))
new_adata.var_names=list(grouped.columns)
new_adata.var.columns=['Orthogroup']
new_adata.write(label+'_merge.h5ad')
if is_return:
return new_adata
实例
f1='/data/mouse.h5ad'
specie='mouse'
gdf=pd.read_csv('/data/offerfinder/geneid2symbol.csv',sep=',')
merge_gene(infile=f1,gdf=gdf,specie=specie,label='out')