# -*- coding: utf-8 -*-
# 2020.01.09
# @zlk
# 将已经拿到的基因组染色体按照从长到短排列,还要把gff里的改了
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import numpy
#老fasta文件
fa_file=sys.argv[1]
# 新fasta文件
new_fa=open(sys.argv[2],'w')
#老GFF文件
file3=sys.argv[3]
#新GFF文件
new_gff=open(sys.argv[4],'w')
#染色体开头
chr_name=sys.argv[5]
#未挂载的contig开头
contig_name=sys.argv[6]
def get_chr_length(file_name):
fa_dict = SeqIO.to_dict(SeqIO.parse(file_name, "fasta", IUPAC.unambiguous_dna))
length_list=[]
for i in fa_dict.keys():
length_list.append([i,len(fa_dict[i])])
return length_list
length=get_chr_length(fa_file)
length.sort(key=lambda x: int(x[1]),reverse=True)
index=1
change_dict={}
new_length=[]
for i in length:
if i[0].startswith(chr_name):
if index<10:
newname=chr_name+str(0)+str(index)
else:
newname = chr_name + str(index)
index+=1
i.append(newname)
new_length.append(i)
change_dict[i[0]]=newname
elif i[0].startswith(contig_name):
continue
#修改fa
fa_dict=SeqIO.to_dict(SeqIO.parse(fa_file, "fasta", IUPAC.unambiguous_dna))
for i in new_length:
new_fa.write('>'+i[2]+'\n')
new_fa.write(str(fa_dict[i[0]].seq)+'\n')
for key in fa_dict:
if key.startswith(contig_name):
new_fa.write('>'+key+'\n')
new_fa.write(str(fa_dict[key].seq)+'\n')
# 改gff
for i in new_length:
old_gff = open(file3, 'r')
for line in old_gff:
line_list=line.strip().split('\t')
if line_list[0]==i[0]:
new_gff.write(change_dict[line_list[0]]+'\t')
for j in line_list[1:]:
new_gff.write(j+'\t')
new_gff.write('\n')
old_gff.close()
old_gff=open(file3,'r')
for line in old_gff:
line_list = line.strip().split('\t')
if line_list[0] not in list(numpy.array(new_length)[:,0]):
new_gff.write(line)
python change.py old.fa new.fa old.gff new.gff chr contig