注:脚本是用GPT生成的,请自己甄别。
- perl gff3_to_bed12.pl in.gff > out.bed12
cat gff3_to_bed12.pl
#!/usr/bin/perl
use strict;
use warnings;
# 用法:
# perl gff3_to_bed12.pl in.gff > out.bed12
#
# 要点:
# - 以 mRNA 的 ID 作为转录本键与 BED12 的 name(若有 Name 则优先)。
# - exon/CDS 通过 Parent 归并到对应 mRNA。
# - 坐标:GFF 1-based 闭区间 -> BED 0-based 半开(start-1, end 原样)。
# - 无 CDS 的转录本:thickStart=thickEnd=txStart(显示为非编码)。
# - blockSizes 与 blockStarts 不带尾随逗号。
my $gff = shift or die "Usage: $0 input.gff3 > output.bed12\n";
open my $IN, "<", $gff or die "Cannot open $gff: $!\n";
# 以转录本 ID 为键
my (%CHR, %STR, %NAME, %EXONS, %CDS_S, %CDS_E);
while (<$IN>) {
chomp;
next if /^#/;
my ($chr, undef, $type, $start, $end, undef, $strand, undef, $attr) = split(/\t/);
$type = lc $type;
# 解析属性为哈希(split 限制 2,兼容像 Class== 之类的情况)
my %A;
for my $kv (split /;/, $attr) {
next unless length $kv;
my ($k, $v) = split(/=/, $kv, 2);
next unless defined $k;
$A{$k} = defined $v ? $v : '';
}
# 转换为 0-based 半开
my $s0 = $start - 1;
my $e0 = $end;
# mRNA 或 transcript 行:记录基础信息
if ($type eq 'mrna' or $type eq 'transcript') {
my $tid = $A{ID} // next;
$CHR{$tid} = $chr;
$STR{$tid} = $strand;
$NAME{$tid} = $A{Name} // $tid;
}
# exon/CDS:按 Parent 归并
elsif ($type eq 'exon' or $type eq 'cds') {
my $parent = $A{Parent} // next;
for my $tid (split /,/, $parent) {
$tid =~ s/^\s+|\s+$//g;
# 若 mRNA 行在后面出现,先占位
$CHR{$tid} //= $chr;
$STR{$tid} //= $strand;
$NAME{$tid} //= $tid;
if ($type eq 'exon') {
push @{$EXONS{$tid}}, [$s0, $e0];
} else { # cds
$CDS_S{$tid} = $s0 if !defined $CDS_S{$tid} || $s0 < $CDS_S{$tid};
$CDS_E{$tid} = $e0 if !defined $CDS_E{$tid} || $e0 > $CDS_E{$tid};
}
}
}
}
close $IN;
# 输出 BED12
for my $tid (keys %EXONS) {
my @ex = sort { $a->[0] <=> $b->[0] } @{$EXONS{$tid}};
next unless @ex; # 必须有外显子
my $txS = $ex[0]->[0];
my $txE = $ex[-1]->[1];
my @blockSizes = map { $_->[1] - $_->[0] } @ex;
my @blockStarts = map { $_->[0] - $txS } @ex;
# CDS 范围(无 CDS 则设为非编码)
my ($thickS, $thickE) = (defined $CDS_S{$tid} && defined $CDS_E{$tid})
? ($CDS_S{$tid}, $CDS_E{$tid})
: ($txS, $txS);
my $chrom = $CHR{$tid} // 'chrN';
my $strand = $STR{$tid} // '+';
my $name = $NAME{$tid} // $tid;
my $sizes = join(",", @blockSizes); # 无尾逗号
my $starts = join(",", @blockStarts); # 无尾逗号
print join("\t",
$chrom, $txS, $txE, $name, 0, $strand,
$thickS, $thickE, 0,
scalar(@ex),
$sizes,
$starts
), "\n";
}