1 输入文件
已经计算好的基因数量文件(每个基因组一个文件,每行为"基因名<TAB>数量",待合并)
genome.list:所有基因组(基因数量文件)的文件名,每行一个
gene.list:合并所有基因组后去重得到的基因列表(the unique gene list of merged genome list,即 pan genome),每行一个
2 python3代码
思路:
读取 gene list 和 genome list,去掉每行的换行符后存入列表
用 numpy.zeros(n) 生成 n 个 0 来填充 gene × genome 二维表;用 pandas.DataFrame(np.zeros(n).reshape((行数, 列数)), columns = 基因组列表, index = 基因列表) 构造该二维表
遍历各基因组的基因数量文件,按照基因名把数量赋给表格中对应的单元格
#!/usr/bin/env python
import re,sys,os
import pandas as pd
import numpy as np
# Read the input lists and strip the trailing newline from every entry.
# Read the list of genes whose presence/absence (PAV) is to be tabulated.
with open("gene.list", 'r') as gene_handle:
    list_genes = gene_handle.readlines()
# Blank lines are skipped so that they do not become an empty-string gene
# name (which would later show up as a spurious row in the table).
list_genes_enter = [line.strip() for line in list_genes if line.strip()]
# Read the list of genomes; each entry is the file name of one genome's
# per-gene count table.
with open("genome.list", 'r') as genome_handle:
    list_genomes = genome_handle.readlines()
# Blank lines are skipped: an empty genome name would otherwise create an
# unnamed column and make the later open() call target a directory.
list_genomes_enter = [line.strip() for line in list_genomes if line.strip()]
# Build the gene x genome table, initialised to zero everywhere:
# one row per gene, one column per genome.
num_row = len(list_genes_enter)
num_col = len(list_genomes_enter)
num_total = num_row * num_col
zero_matrix = np.zeros((num_row, num_col))
df = pd.DataFrame(
    zero_matrix,
    index=list_genes_enter,
    columns=list_genomes_enter,
)
# For every genome, read its per-gene count file and copy each count into the
# matching (gene, genome) cell of df. Cells never mentioned keep their initial 0.
route="/hwfssz1/ST_HEALTH/P18Z10200N0423/liuxudong/data/Cazy_results/All_overview/overview23/All_overview_express/All_overview_express_uniq_c_awk"
for each_genome in list_genomes_enter:
    target_file = "{}/{}".format(route, each_genome)
    # Stream the count file line by line instead of loading it all at once.
    with open(target_file, 'r') as target:
        for each_target in target:
            each_target = each_target.strip()
            # Skip blank lines (e.g. a trailing empty line); they carry no
            # gene/count pair and would otherwise raise IndexError below.
            if not each_target:
                continue
            # Each line is "<gene>\t<count>"; split once and reuse the fields.
            fields = each_target.split('\t')
            each_target_gene = fields[0]
            # Store the count as a number. Assigning the raw string into the
            # float64 frame relies on implicit upcasting to object dtype,
            # which pandas has deprecated, and it produces a table that mixes
            # floats with strings.
            each_target_num = float(fields[1])
            # .loc addresses the cell by label (.iloc would be by position).
            # NOTE: if the gene is not already a row label, pandas appends a
            # new row for it (other cells NaN) rather than raising.
            df.loc[each_target_gene, each_genome] = each_target_num
    print("\033[32m {} DONE!\033[0m".format(each_genome))
# Write the finished table to disk as tab-separated text, keeping the gene
# names as the first (index) column.
df.to_csv(
    'table.txt',
    sep='\t',
    index=True,
)