依赖biopyhon包
pip install biopython
首先获得当前基因组文件的序列名,以便后续编辑。注意,生成的list.txt文件原编辑为等下提到的三列格式。
seqkit fx2tab -n genome.fa > list.txt
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# 创建命令行参数解析器
parser = argparse.ArgumentParser(description="Adjust sequence order and direction based on a list file.")
# 添加命令行参数
parser.add_argument("-g", "--genome", help="Path to the genome.fa file", required=True)
parser.add_argument("-l", "--list", help="Path to the list.txt file", required=True)
parser.add_argument("-o", "--output", help="Output file name", required=True)
# 添加使用帮助
parser.epilog = """
Example usage:
python adjust_sequence.py -g genome.fa -l list.txt -o adjusted_genome_modified.fa
"""
# 解析命令行参数
args = parser.parse_args()
# 读取genome.fa文件
genome_records = SeqIO.to_dict(SeqIO.parse(args.genome, "fasta"))
# 读取list.txt文件
with open(args.list, "r") as file:
listed_sequences = []
for line in file:
line = line.strip().split("\t")
seq_name = line[0]
direction = int(line[1])
new_seq_name = line[2]
# 获取序列
seq = genome_records[seq_name].seq
# 根据方向调整序列
if direction == 2:
seq = seq.reverse_complement()
# 创建新的序列记录
new_record = SeqRecord(seq, id=new_seq_name, description="")
# 将新的序列记录添加到字典中
genome_records[new_seq_name] = new_record
# 记录已列出的序列
listed_sequences.append(new_seq_name)
# 将未列出的序列附加在列出序列的后面
for seq_name, record in genome_records.items():
if seq_name not in listed_sequences:
genome_records[new_seq_name] = record
# 保存调整后的序列到新的fasta文件
SeqIO.write(genome_records.values(), args.output, "fasta")
用法
python adjust_sequence.py -g genome.fa -l list.txt -o adjusted_genome_modified.fa
其中list.txt分三列,第一列原序列名;第二列,如果是1就原方向,2就变为反向互补;第三列,新序列名。以制表符分隔
如
B04 2 B01
B08 2 B02
B07 1 B03
B06 2 B04
B02 2 B05
B05 1 B06
B01 2 B07
B03 2 B08