生信虐我系列之--几个必备小程序:
这个程序可以根据提供的 SNP位点信息,以及参考注释信息对 SNP 位点突变前后的 氨基酸 进行比较。
** 修改了部分代码,重新设计了 SNP 突变位点落在反向转录本上的情况**
** 重新设计了结果的输出格式**
** 欢迎积极提出设计上的不足**
输入文件格式:
1. 参考注释的 GTF 文件;
2. 参考注释的基因组 fasta 文件;
3. 输入的 SNP 列表文件:
1 49527 A G Exon:Zm00001d027230;
1 51053 A T Exon:Zm00001d027231;
1 53013 A G Exon:Zm00001d027231;
1 53109 C T Exon:Zm00001d027231;
第一列: 染色体
第二列:突变位置
第三列:SNP 参考的碱基
第四列:SNP 位点突变后的碱基
第五列:[可选],SNP 的信息
使用方法:
perl script.pl zma.gtf zma.fa in.txt >result.txt
程序源码如下:(遵循 GPL v3 Licenses)
#!/usr/bin/perl -w
use strict;
die "Usage: perl $0 <zma.GTF> <zma.FA> <in.TXT> >Result.txt \n" if @ARGV < 3;
&timing("start")
my (%tr, %cds, %seq);
open IN, $ARGV[0] or die $!;
while (<IN>){
chomp;
next if /^#/;
my @a = split /\t/,$_;
if ($a[2] eq "transcript"){
# (my $gene) = $a[8] =~ /gene_id\s"(\S+)";/;
(my $id) = $a[8] =~ /transcript_id\s"(\S+)";/;
# $tr{$a[0]}{$gene}{$id} = [@a[3,4,6]];
$tr{$a[0]}{$id} = [@a[3,4,6]];
}
if ($a[2] eq "CDS"){
# (my $gene) = $a[8] =~ /gene_id\s"(\S+)";/;
(my $id) = $a[8] =~ /transcript_id\s"(\S+)";/;
next unless exists $tr{$a[0]}{$id};
push @{$cds{$a[0]}{$id}},[@a[3,4,7]];
}
}
close IN;
# make fasta hash
open FA,$ARGV[1] or die $!;
$/ = ">"; <FA>;
while(<FA>){
chomp;
my ($info,$fa) = split /\n/,$_,2;
my ($chr,$tmp) = split /\s/,$info,2;
$fa =~ s/\n+//g;
$seq{$chr} = $fa;
}
$/ = "\n";
close FA;
# deal with input
open IN,$ARGV[2] or die $!;
print "# Chrom:Loci:Ref->Alt\tTranscript\tStrand\tCDS(Ref->Alt)\tRef_Codon\tAlt_Codon\tRef_aa\tAlt_aa\tAlt_Pos\tCDS_Pos\n";
while (<IN>){
chomp;
my ($chr,$loci,$ref,$alt,$tmp) = split /\t/,$_;
my ($strand,$j,$tr_id);
my @arr;
my $flag = 0;
if ( exists $tr{$chr} ){
foreach my $k (sort keys $tr{$chr}){
if ($loci >= $tr{$chr}{$k}->[0] && $loci <= $tr{$chr}{$k}->[1]){
$flag = 1;
$strand = $tr{$chr}{$k}->[2];
@arr = sort{$a->[0] <=> $b->[0] || $a->[1] <=> $b->[1]} @{$cds{$chr}{$k}};
for (my $i=0;$i<@arr;$i++){
if ($loci >= $arr[$i]->[0] && $loci <= $arr[$i]->[1]){
$flag = 2;
$j = $i;
$tr_id = $k;
last;
}
}
}
}
}
if ($flag == 2 ){
my ($pos, $seq, $site, $ref_codon, $alt_codon, $ref_aa, $alt_aa);
my ($ref_nucl, $alt_nucl, $alt_pos);
my $c = &code();
if ($strand eq "+"){
foreach (0..($j-1)){
$pos += $arr[$_]->[1] - $arr[$_]->[0] + 1;
}
$pos += $loci - $arr[$j]->[0] + 1;
foreach my $v (@arr){
my ($x,$y) = @$v[0,1];
$seq .= uc(substr($seq{$chr},$x-1,$y-$x+1));
}
$ref_nucl = substr($seq,$pos-1,1);
$alt_nucl = $alt;
$site = $pos % 3 ;
if($site == 1){
$ref_codon = uc(substr($seq,$pos - 1,3));
$alt_codon = &ref2alt($ref_codon,1,$alt_nucl);
$alt_pos = 1;
}elsif($site == 2){
$ref_codon = uc(substr($seq,$pos - 2,3));
$alt_codon = &ref2alt($ref_codon,2,$alt_nucl);
$alt_pos = 2;
}elsif($site == 0){
$ref_codon = uc(substr($seq,$pos - 3,3));
$alt_codon = &ref2alt($ref_codon,3,$alt_nucl);
$alt_pos = 3;
}
}
if ($strand eq "-"){
my $pos1;
foreach (0..($j-1)){
$pos1 += $arr[$_]->[1] - $arr[$_]->[0] + 1;
}
$pos1 += $loci - $arr[$j]->[0] + 1;
foreach my $v (@arr){
my ($x,$y) = @$v[0,1];
$seq .= uc(substr($seq{$chr},$x-1,$y-$x+1));
}
my $len = length $seq;
$pos = $len - $pos1 + 1;
$seq = reverse $seq;
$seq =~ tr/AaGgCcTt/TtCcGgAa/;
$ref_nucl = substr($seq,$pos-1,1);
$alt_nucl = $alt;
$alt_nucl =~ tr/AaGgCcTt/TtCcGgAa/;
$site = $pos % 3;
if($site == 1){
$ref_codon = uc(substr($seq,$pos - 1,3));
$alt_codon = &ref2alt($ref_codon,1,$alt_nucl);
$alt_pos = 1;
}elsif($site == 2){
$ref_codon = uc(substr($seq,$pos - 2,3));
$alt_codon = &ref2alt($ref_codon,2,$alt_nucl);
$alt_pos = 2;
}elsif($site == 0){
$ref_codon = uc(substr($seq,$pos - 3,3));
$alt_codon = &ref2alt($ref_codon,3,$alt_nucl);
$alt_pos = 3;
}
}
$ref_aa = exists $c->{"standard"}{$ref_codon} ? $c->{"standard"}{$ref_codon} : "-";
$alt_aa = exists $c->{"standard"}{$alt_codon} ? $c->{"standard"}{$alt_codon} : "-";
print "$chr:$loci:$ref->$alt\t$tr_id\t$strand\t$ref_nucl->$alt_nucl\t$ref_codon\t$alt_codon\t$ref_aa\t$alt_aa\t$alt_pos\tcds_$pos\n";
}
}
close IN;
&timing("end");
# time
sub timing{
my $state=shift;
my $time=strftime("%Y-%m-%d.%H:%M:%S",localtime);
print STDOUT "## ==== >> Start at $time\n" if $state eq "start";
print STDOUT "## ==== >> End at $time\n" if $state eq "end";
}
# ref2alt
sub ref2alt{
my ($codon, $pos, $alt) = @_;
my ($c1, $c2, $c3) = split //,$codon;
my $out;
if ($pos == 1){
$out = join ("",$alt,$c2,$c3);
}elsif ($pos == 2){
$out = join ("",$c1,$alt,$c3);
}else{
$out = join ("",$c1,$c2,$alt);
}
return $out;
}
# code
sub code{
my $p = {
"standard" =>
{
'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A', 'GCT' => 'A', # Alanine
'TGC' => 'C', 'TGT' => 'C', # Cysteine
'GAC' => 'D', 'GAT' => 'D', # Aspartic Aci
'GAA' => 'E', 'GAG' => 'E', # Glutamic Aci
'TTC' => 'F', 'TTT' => 'F', # Phenylalanin
'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G', # Glycine
'CAC' => 'H', 'CAT' => 'H', # Histidine
'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I', # Isoleucine
'AAA' => 'K', 'AAG' => 'K', # Lysine
'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L', 'CTT' => 'L', 'TTA' => 'L', 'TTG' => 'L', # Leucine
'ATG' => 'M', # Methionine
'AAC' => 'N', 'AAT' => 'N', # Asparagine
'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P', # Proline
'CAA' => 'Q', 'CAG' => 'Q', # Glutamine
'CGA' => 'R', 'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'AGA' => 'R', 'AGG' => 'R', # Arginine
'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S', 'AGC' => 'S', 'AGT' => 'S', # Serine
'ACA' => 'T', 'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T', # Threonine
'GTA' => 'V', 'GTC' => 'V', 'GTG' => 'V', 'GTT' => 'V', # Valine
'TGG' => 'W', # Tryptophan
'TAC' => 'Y', 'TAT' => 'Y', # Tyrosine
'TAA' => 'U', 'TAG' => 'U', 'TGA' => 'U' # Stop
}
}
}
__END__
1 49527 A G Exon:Zm00001d027230;
1 51053 A T Exon:Zm00001d027231;
1 53013 A G Exon:Zm00001d027231;
1 53109 C T Exon:Zm00001d027231;
1 53826 T C Exon:Zm00001d027231;
1 53885 T C Exon:Zm00001d027231;
本文原创,转载请注明出处!
欢迎留言讨论!