2024-07-23 contigs水平基因注释转化成染色体水平基因注释

1. 原因:利用gmaps回帖基因,会损失和错误很多,导致busco结果降低。所以可以直接基于挂载生成的agp文件直接进行转化,不损失任何基因。

#!/usr/bin/env python

##usage:python convert_gff_with_agp.py contig.gene.gff3 (contig水平基因注释结果) Chr.agp (contigs染色体水平挂载结果) Chr.gene.gff3 (染色体水平基因注释结果)

import sys

def convert_file(in_gff3, in_agp, out_gff):

ctg_on_chr = {}

with open(in_agp, 'r') as fin:

for line in fin:

if line.strip() == '' or line[0] == "#":

continue

data = line.strip().split()

if data[4] == 'U':

continue

chrn = data[0]

start_pos = int(data[1])

end_pos = int(data[2])

ctg = data[5]

direct = data[-1]

ctg_on_chr[ctg] = [chrn, start_pos, end_pos, direct]

gff_db = {}

with open(in_gff3, 'r') as fin:

for line in fin:

if line[0] == '#' or len(line.strip()) == 0:

continue

data = line.strip().split()

ctg = data[0]

sp = int(data[3])

ep = int(data[4])

if ctg not in ctg_on_chr:

continue

chrn, csp, cep, cdir = ctg_on_chr[ctg]

if cdir == '+':

nsp = csp+sp-1

nep = csp+ep-1

else:

nep = cep-sp+1

nsp = cep-ep+1

data[6] = '+' if data[6] == '-' else '-'

data[0] = chrn

data[3] = str(nsp)

data[4] = str(nep)

if data[2] == 'gene':

gsp = nsp

info = '\t'.join(data)

if chrn not in gff_db:

gff_db[chrn] = {}

if gsp not in gff_db[chrn]:

gff_db[chrn][gsp] = []

gff_db[chrn][gsp].append(info)

with open(out_gff, 'w') as fout:

for chrn in sorted(gff_db):

for gsp in sorted(gff_db[chrn]):

fout.write('\n'.join(gff_db[chrn][gsp]))

fout.write("\n###\n")

if __name__ == "__main__":

if len(sys.argv) < 3:

print("Usage: python "+sys.argv[0]+" <in_gff3> <in_agp> <out_gff>")

else:

in_gff3, in_agp, out_gff = sys.argv[1:]

convert_file(in_gff3, in_agp, out_gff)

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容