Orthofinder产生的Orthogroups.GeneCount.tsv不能直接作为CAFE软件的输入文件,需要做如下处理:
filter_large_gene_family.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
作者:徐诗芬
内容:1.cafe的输入格式:去掉total这一列,在第一列添加Description,然后这一列都为空
2.过滤掉一个或多个物种有超过200个基因拷贝的基因家族,size=200
日期:2021.2.3
"""
import sys
from itertools import islice
def usage():
print('Usage: python filter_large_gene_family.py [index_file] [size]')
def main():
ouf1 = open("cafe_gene_family_format.txt", 'wt')
ouf2 = open("cafe_large_gene_family_format.txt", 'wt')
size = eval(sys.argv[2])
with open(sys.argv[1], 'rt') as f:
for head in islice(f, 0, 1): #提取第一行内容
head = head.strip().split("\t")[:-1]
ouf1.write("Description\t")
ouf2.write("Description\t")
for h in head:
ouf1.write(h + "\t")
ouf2.write(h + "\t")
ouf1.write("\n")
ouf2.write("\n")
for line in islice(f, 0, None): # 跳过第一行按行读取文件内容
gene_count = line.strip().split("\t")[:-1]
family_name = gene_count[0]
gene_list = list(map(int, gene_count[1:]))
x = [i for i in gene_list if i > size] # 从列表中找出大于某个值的元素
if len(x) > 0:
ouf2.write("\t")
for n in gene_count:
ouf2.write(n)
ouf2.write("\t")
ouf2.write("\n")
else:
ouf1.write("\t")
for n in gene_count:
ouf1.write(n)
ouf1.write("\t")
ouf1.write("\n")
ouf1.close()
ouf2.close()
try:
main()
except IndexError:
usage()