1 根据分组提取特定样本
2 提取特定样本的ko丰度表
3 把KO注释做成字典
4 计算0的数量,每组样本数,非0百分比;给字典
5 KO字典注释,没有KO的不注释(db需要更新)
#!/usr/bin/env python3
import pandas as pd
import numpy as np
import os, re, sys
# meta
meta = pd.read_table("../association_ko_species/meta_group.txt")
sample_g1 = meta[meta['Group'] == "G1"]['Sample']
sample_g2 = meta[meta['Group'] == "G2"]['Sample']
sample_g3 = meta[meta['Group'] == "G3"]['Sample']
sample_g4 = meta[meta['Group'] == "G4"]['Sample']
sample_g5 = meta[meta['Group'] == "G5"]['Sample']
sample_g6 = meta[meta['Group'] == "G6"]['Sample']
# ko
ko = pd.read_table("../association_ko_species/merge_out.txt",
index_col=0)
ko_tmp = pd.DataFrame(ko, columns=list(sample_g6)) ##
# ko_tmp = ko_tmp.iloc[0:10,:]
# ko annotation
file = "/public/home/zzumgg03/huty/databases/kofam/KEGG_KO.txt"
with open(file, 'r') as file:
Anno = {}
for line in file:
line = line.strip()
key = re.split(r'\t', line)[0]
Anno[key] = "\t".join(re.split(r'\t', line)[1:5])
# summary
zero = (ko_tmp == 0).astype(int).sum(axis=1)
sample = np.full(ko_tmp.shape[0], ko_tmp.shape[1], dtype='int')
Dict = {}
Dict['num_zero'] = np.array(zero) # number of zero
Dict['num_sample'] = np.array(sample) # number of samples
Dict['ko'] = list(ko_tmp.index) # ko id
Dict['percent'] = Dict['num_zero']/Dict['num_sample']
# anno geneset ko
Dict['Anno'] = []
for each in Dict['ko']:
if each in Anno.keys():
Dict['Anno'].append(Anno[each])
else:
Dict['Anno'].append("")
# save
with open("ko_percent_g6.txt", 'w') as o:
o.write("ko\tnum_zero\tnum_sample\tpercent\tlevel_1\tlevel_2\tpathway\tgene\n")
for i in range(0, len(Dict['ko'])):
o.write("{}\t{}\t{}\t{}\t{}\n".format(
Dict['ko'][i],
Dict['num_zero'][i],
Dict['num_sample'][i],
Dict['percent'][i],
Dict['Anno'][i]))
运行
##
nohup python3 sc_ko_percent_anno_g1.py &
nohup python3 sc_ko_percent_anno_g2.py &
nohup python3 sc_ko_percent_anno_g3.py &
nohup python3 sc_ko_percent_anno_g4.py &
nohup python3 sc_ko_percent_anno_g5.py &
nohup python3 sc_ko_percent_anno_g6.py &