#!/usr/bin/perl -w
use strict;
use Data::Dumper;
my $file='genome.fasta.gz';
open A,"gunzip -dc $file|" or die;
#$/=">";<A>;
my $num=0;
my %hash;
my $name;
while(<A>){
chomp;
if (/>(.+)/){
$num ++;
$name=$1;
}else{
tr/atcg/ATCG/;
$hash{$name}.= $_;
}
}
#print Dumper(\%hash);
close A;
my $len; my $all_len;
my $countC;my $countG;my $countCpG;
my %hash_len;
foreach my $key (sort keys %hash){
$len=length $hash{$key};
$hash_len{$key}=$len;
$all_len += $len;
$countC +=$hash{$key}=~s/C/C/g;
$countG +=$hash{$key}=~s/G/G/g;
$countCpG +=$hash{$key}=~s/CG/CG/g;
}
my $half_all_len=$all_len/2;
my $N50=0;
my $in_len;
for my $len_num (sort{$b <=> $a} values %hash_len ){
$in_len += $len_num;
if ($in_len >= $half_all_len){
$N50 = $len_num;
last;
}
}
my $countCG = sprintf"%0.3f",($countC+$countG)/$all_len;
print "序列条数:$num\n总长:$all_len\n";
print "N50:$N50\n";
print "GC含量:$countCG\nCpG位点数:$countCpG\n";
输出结果:
序列条数:942
总长:95269455
N50:192515
GC含量:0.46
CpG位点数:1000
2.只计算每条的长度
#!/usr/bin/perl -w
use strict;
use Data::Dumper;
my $file=shift;
open A,$file or die;
$/=">";<A>;
while(<A>){
my $id=$1 if(/^(\S+)/);
$/=">";
chomp;
tr/atcg/ATCG/;
s/^.+?\n//;
s/\s//g;
my $a = length $_;
print "$id\t$a\n";
}
$/="\n";
close A;
结果
>111 200
>2222 40
或者是这样:
#!/usr/bin/perl -w
use strict;
use Data::Dumper;
my $file=shift;
open IN,$file or die;
$/='>';<IN>;$/="\n";
while(<IN>){
my $id=$1 if(/^(\S+)/);
$/='>';
my $seq=<IN>;
chomp($seq);
$seq=~s/\s+//g;
$seq=~tr/atcg/ATCG/;
my $length=length($seq);
$/="\n";
print "$length\n";
}
close IN;