1.Bam2fq (by 森哥)
脚本如下:
#!/usr/bin/perl
=head1 Description
ccsbam2fq.pl read BAM file of PacBio CCS and transform into FASTQ file, and output the length, accuracy (rq), and number of passes (np) of each ccs read.
=head1 Author
Version 1.0, Date 2021/07/05.
Sen Wang, wangsen1993@163.com.
=head1 Usage
perl ccsbam2fq.pl -i ccs.reads.bam -o ccs.reads.fq.gz -s ccs.read.fq.gz.stat [-h]
=cut
use strict;
use Getopt::Long;
# parse input options
my ($inbam, $outfq, $outstat, $help);
GetOptions(
"input|i=s"=>\$inbam,
"output|o=s"=>\$outfq,
"stat|s=s"=>\$outstat,
"help|h!"=>\$help
);
if (not $inbam or $help) {
die `pod2text $0`;
}
if (not $outfq) {
$outfq = "$outfq\.fq.gz";
}
if (not $outstat) {
$outstat = "$outfq\.stat";
}
# read BAM file, extract sequence and quality, and write into compressed FASTQ file
open IN, "samtools view $inbam |" or die "Cannot open $inbam!\n";
open OUT, "| gzip > $outfq" or die "Cannot create or write to $outfq!\n";
open STAT, ">$outstat" or die "Cannot create or write to $outstat!\n";
print STAT "#Read_id\tLength\tRead_accuracy\tNumber_passes\n";
while (<IN>) {
next if /^@/;
my @t = split(/\t/, $_);
my $rd = $t[0]; # 1st column is read id
my $sq = $t[9]; # 10th column is read sequence
my $ql = $t[10]; # 11th column is read quality ASCII (Phred +33)
my ($np, $rq);
foreach my $t (@t[11 .. @t - 1]) {
if ($t =~ /np/) {
$np = $t; # the column of number of passes (subreads covering each ccs read)
} elsif ($t =~ /rq/) {
$rq = $t; # the column of read quality (average accuracy)
} else {
next;
}
}
print OUT "\@$rd\t$rq\t$np\n$sq\n\+\n$ql\n";
my $ln = length($sq);
$np =~ s/np:i://;
$rq =~ s/rq:f://;
print STAT "$rd\t$ln\t$rq\t$np\n";
}
close IN;
close OUT;
close STAT;
输出结果为:fq.gz 和 fq.gz.stat (记录每条reads信息)
2.fq2fasta
zcat hifi.fq.gz | awk 'NR%4==1{printf ">%s\n", substr($0, 2)} NR%4==2{print}' | cut -f 1|gzip > hifi.fasta.gz
(首先使用zcat解压缩hifi.fq.gz文件,然后使用awk进行处理。awk命令中的代码会检查每行的行号,如果行号除以4的余数为1,则表示这是序列名称行;如果余数为2,则表示这是序列行。对于序列名称行,它会在行的开头添加>符号并输出;对于序列行,它会直接输出。最后,使用gzip将输出结果压缩为hifi.fasta.gz文件。)
3.统计序列信息
# 统计UltraLong.fa.gz中每条read的长度
seqtk comp hifi.fasta.gz | awk '{print $1"\t"$2}'
统计序列具体信息(总reads数,总碱基数,最短read长度,最长read长度,N50值)
#!/usr/bin/perl
###perl count_reads.pl UltraLong.fa.gz
use strict;
use warnings;
# 从命令行参数获取输入文件名
my $input_file = shift @ARGV;
# 检查输入文件是否存在
die "请输入有效的输入文件名。\n" unless defined $input_file && -e $input_file;
# 打开文件
open my $fh, "gzip -dc $input_file |" or die "无法打开文件: $input_file: $!";
my $total_reads = 0;
my $total_bases = 0;
my @read_lengths;
# 逐行读取文件
while (my $line = <$fh>) {
chomp $line;
if ($line =~ /^>/) {
$total_reads++;
} else {
my $length = length($line);
$total_bases += $length;
push @read_lengths, $length;
}
}
# 关闭文件
close $fh;
# 计算统计信息
my $min_length = min(@read_lengths);
my $max_length = max(@read_lengths);
my $n50 = calculate_n50(@read_lengths);
# 输出统计结果
print "总reads数: $total_reads\n";
print "总碱基数: $total_bases\n";
print "最短read长度: $min_length\n";
print "最长read长度: $max_length\n";
print "N50值: $n50\n";
# 计算N50值
sub calculate_n50 {
my @lengths = sort { $b <=> $a } @_;
my $total_length = sum(@lengths);
my $half_length = $total_length / 2;
my $n50 = 0;
my $running_length = 0;
foreach my $length (@lengths) {
$running_length += $length;
if ($running_length >= $half_length) {
$n50 = $length;
last;
}
}
return $n50;
}
# 计算数组最小值
sub min {
my $min = shift;
foreach my $value (@_) {
$min = $value if $value < $min;
}
return $min;
}
# 计算数组最大值
sub max {
my $max = shift;
foreach my $value (@_) {
$max = $value if $value > $max;
}
return $max;
}
# 计算数组总和
sub sum {
my $sum = 0;
foreach my $value (@_) {
$sum += $value;
}
return $sum;
}
4.提取大于50 Kb的UltraLong read用于基因组组装。
seqtk seq -L 50000 hifi.fq.gz| awk 'BEGIN{RS="@"} length($0)>20000 {printf "@"$0}'|gzip > 50Kb.hifi.fq.gz