适用于 NCBI 数据库中 gff 格式文件转换为 gtf 格式,使用 perl 代码实现:
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Data::Dumper;
my ($help, $infile, $outdir, $gff2gtf, $gene2tr);
GetOptions(
"infile=s" => \$infile,
"outdir:s" => \$outdir,
"gene2rt!" => \$gene2tr,
"gff2gtf!" => \$gff2gtf,
"help|?" => \$help
);
die &usage if (!defined $infile || defined $help);
$outdir ||= ".";system("mkdir -p $outdir");
if (defined $gff2gtf){
my ($gene, $gene_info, $mRNA, $exon, $cds) = &gff2gtf($infile);
foreach my $g (sort keys %$gene){
my $out_g = $gene_info->{$g};
my $gene_id = $gene_info->{$g}->[4];
my $gene_type = $gene_info->{$g}->[5];
print "$out_g->[0]\tGeneBang\tgene\t$out_g->[1]\t$out_g->[2]\t.\t$out_g->[3]\t.\tgene_id \"$gene_id\"; gi \"$g\"; gene_biotype \"$gene_type\";\n";
foreach my $m (sort keys %{$mRNA->{$g}}){
if (exists $mRNA->{$g}->{$m}->{'mRNA'}){
my $out_m = $mRNA->{$g}->{$m}->{'mRNA'};
my $transcript_id = $out_m->[4];
print "$out_m->[0]\tGeneBang\ttranscript\t$out_m->[1]\t$out_m->[2]\t.\t$out_m->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\"; gi \"$g\"; transcript_biotype \"$gene_type\";\n";
if (exists $exon->{$m}){
foreach my $e (sort keys %{$exon->{$m}}){
my $out_e = $exon->{$m}->{$e};
print "$out_e->[0]\tGeneBang\texon\t$out_e->[1]\t$out_e->[2]\t.\t$out_e->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\"; gi \"$g\"; transcript_biotype \"$gene_type\"; exon_id \"$e\"; \n";
}
}
if (exists $cds->{$m}){
foreach my $c (sort keys %{$cds->{$m}}){
my @out_pos = @{$cds->{$m}->{$c}->{pos}};
my $protein_id = $cds->{$m}->{$c}->{pep};
foreach my $pos (@out_pos){
print "$out_m->[0]\tGeneBang\tCDS\t$pos\t.\t$out_m->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\"; gi \"$g\"; transcript_biotype \"$gene_type\"; cds_id \"$c\"; protein_id \"$protein_id\"; \n";
}
}
}
}elsif(exists $mRNA->{$g}->{$m}->{'ncRNA'} ){
my $out_n = $mRNA->{$g}->{$m}->{'ncRNA'};
my $bio_type = $out_n->[4];
print "$out_n->[0]\tGeneBang\ttranscript\t$out_n->[1]\t$out_n->[2]\t.\t$out_n->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$m\"; gi \"$g\"; transcript_biotype \"$bio_type\";\n";
if (exists $exon->{$m}){
foreach my $e (sort keys %{$exon->{$m}}){
my $out_e = $exon->{$m}->{$e};
print "$out_e->[0]\tGeneBang\texon\t$out_e->[1]\t$out_e->[2]\t.\t$out_e->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$m\"; gi \"$g\"; transcript_biotype \"$bio_type\"; exon_id \"$e\"; \n";
}
}
}else{
next;
}
}
}
}
if (defined $gene2tr){
open OUT, ">$outdir/gene2tr.txt" || die $!;
my $gene2tr = &gene2tr("$infile");
foreach my $k (sort keys %{$gene2tr}){
foreach (@{$gene2tr->{$k}}){
print OUT "$k\t$_\n";
}
}
close OUT;
}
sub gff2gtf{
my $gff = shift @_;
my (%gene, %gene_info, %mRNA, %exon, %cds);
open GFF, "< $gff" || die $!;
while(<GFF>){
chomp;
next if /^$|^#/;
my @a = split /\t/, $_;
if ($a[2] =~ /gene/i){
my ($id) = $a[8] =~ /ID=(gene\d+)/;
my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
my ($gene_type) = $a[8] =~ /gene_biotype=([^;]+)/;
$gene_info{$gene} = [$a[0], $a[3], $a[4], $a[6], $id, $gene_type];
$gene{$gene} = $id;
}elsif($a[2] =~ /mRNA|lnc_RNA|trascript/i){
my ($rna_id) = $a[8] =~ /ID=(rna\d+)/;
my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
my ($transcript) = $a[8] =~ /transcript_id=([^;]+)/;
$mRNA{$gene}{$rna_id}{mRNA} = [$a[0], $a[3], $a[4], $a[6], $transcript];
}elsif($a[2] =~ /exon/){
my ($exon_id) = $a[8] =~ /ID=(id\d+)/;
my ($rna_id) = $a[8] =~ /Parent=([^;]+)/;
$exon{$rna_id}{$exon_id} = [$a[0], $a[3], $a[4], $a[6]];
}elsif($a[2] =~ /CDS/i){
my ($cds_id) = $a[8] =~ /ID=(cds\d+)/;
my ($rna_id) = $a[8] =~ /Parent=([^;]+)/;
my ($protein)= $a[8] =~ /protein_id=([^;]+)/;
$cds{$rna_id}{$cds_id}{pep} = $protein;
push @{$cds{$rna_id}{$cds_id}{pos}},"$a[3]\t$a[4]";
}elsif($a[2] =~ /rRNA/i){
my ($rna_id) = $a[8] =~ /ID=(rna\d+)/;
my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
$mRNA{$gene}{$rna_id}{ncRNA} = [$a[0], $a[3], $a[4], $a[6], "rRNA"];
}elsif($a[2] =~ /tRNA/i){
my ($rna_id) = $a[8] =~ /ID=(rna\d+)/;
my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
$mRNA{$gene}{$rna_id}{ncRNA} = [$a[0], $a[3], $a[4], $a[6], "tRNA"];
}else{
next;
}
}
return (\%gene, \%gene_info, \%mRNA, \%exon, \%cds);
}
sub gene2tr{
my $gff = shift @_;
my %out;
open GFF, "<$gff" || die $!;
while(<GFF>){
chomp;
next if (/^$|^#/);
my @a = split /\t/, $_;
if ($a[2] =~ /mRNA|transcript|lnc_RNA/i){
my ($gene) = $a[8] =~ /Parent=(gene\d+)/;
my ($transcript) = $a[8] =~ /transcript_id=(\w+_\d+\.\d+)/;
push @{$out{$gene}}, $transcript;
}else{
next;
}
}
close GFF;
return \%out;
}
sub usage{
print STDERR<<EOF;
===========================================================
Name:
$0
Usage:
perl $0 [options]
Options:
-infile* input file [gff/gff3/gtf]
-outdir outdir for ouputs files
-fasta fasta sequence [-ncr/-cds requared]
-gene2tr out gene2tr file
-gff2gtf convert gff/gff3 to gtf format
-ncrna fetch ncRNA tpye [defuall: all, rRNA/tRNA/ncRNA]
-cds fetch cds sequence
-help|? print this help information
e.g:
perl $0 -infile boluo.gff -outdir ./ -gene2tr -gff2gtf
===========================================================
EOF
exit 1;
}
__END__
Author : liupeng@genebang.com
Date : Fri Dec 15 09:55:07 CST 2017