导读
CheckM是用于评估分离出的微生物、单细胞和宏基因组的常用质量工具。其使用有谱系世系关系的特有和独有基因数据集来大致估计基因组的完整度和污染程度。N50、N90、GC含量、Bin size等基因组de novo组装分析常用的参数可以通过Python、Perl脚本计算。
一、计算完成度、污染度
CheckM的代码全部托管在github上。
官方主页:https://ecogenomics.github.io/CheckM/
下载地址:https://github.com/Ecogenomics/CheckM
说明文档:https://github.com/Ecogenomics/CheckM/wiki
# 质检:CheckM
time checkm lineage_wf -f Bin_all/Bin_quality/checkm.txt -t $thread -x fa Bin_all/Bin/ Bin_all/Bin_quality/
# 整理:shell单行命令
grep 'bin' Bin_all/Bin_quality/checkm.txt | sed 's/^ //' | awk '{print $1,$13,$14}' | sed 's/\ /\t/g'| sed 's/\./\t/' | sort -n -k 2 | sed 's/\t/./' | sort -k 2 -n -r | sed '1i\BinID\tCompleteness\tContamination' > Bin_all/Bin_quality/checkm_raw.txt
# 筛选:shell单行命令
grep 'bin' Bin_all/Bin_quality/checkm.txt | sed 's/^ //' | awk '{print $1,$13,$14}' | sed 's/\ /\t/g'| sed 's/\./\t/' | sort -n -k 2 | sed 's/\t/./' | awk '{if($2>=70 && $3<=10) print $0}' | sort -k 2 -n -r | sed '1i\BinID\tCompleteness\tContamination' > Bin_all/Bin_quality/checkm_pick.txt
# 筛选ID:shell单行命令
awk '{print $1}' Bin_all/Bin_quality/checkm_pick.txt | sed '1d' > Bin_all/Bin_quality/checkm_pick_id.txt
# 移动筛选Bin:shell命令
for i in `cat Bin_all/Bin_quality/checkm_pick_id.txt`; do
mv Bin_all/Bin/$i.fa Bin_all/Bin_pick/
done
二、计算N50、N90
Perl脚本:N50.N90.pl
#!/usr/bin/perl -w
my ($len,$total)=(0,0);my @x;while(<>){if(/^[\>\@]/){if($len>0){$total+=$len;push@x,$len;};$len=0;}else{s/\s//g;$len+=length($_);}}if ($len>0){$total+=$len;push @x,$len;}@x=sort{$b<=>$a}@x; my ($count,$half)=(0,0);for (my $j=0;$j<@x;$j++){$count+=$x[$j];if(($count>=$total/2)&&($half==0)){print "N50: $x[$j]\n";$half=$x[$j]}elsif($count>=$total*0.9){print "N90: $x[$j]\n";exit;}}
参考:【Panda姐-perl练习题13】计算N50,N90等
R脚本:N50.N90.sort.R
#!/usr/bin/env Rscript
setwd("Bin_all/Bin_summary/")
data=read.table("N50.N90.txt", header=F)
BinID=vector()
N50=vector()
N90=vector()
index=seq(from=1, to=length(data[, 1]), by=3)
a=1
for(i in index)
{
BinID[a]=as.character(data[i, "V1"])
N50[a]=as.character(data[i+1, "V1"])
N90[a]=as.character(data[i+2, "V1"])
a=a+1
}
N50_N90_out=data.frame(BinID, N50, N90)
write.table(N50_N90_out, file="N50.N90.out.txt", sep="\t", quote=F, row.names=F)
批处理
#批处理:计算N50和N90
for i in Bin_all/Bin_pick/*.fa; do
base=${i##*/}
name=${base%.fa}
echo $name >> Bin_all/Bin_summary/N50.N90.txt
perl /home/cheng/huty/softwares/script_bin/N50.N90.pl $i >> Bin_all/Bin_summary/N50.N90.txt
done
# 去掉结果中的N50、N90
sed -i 's/N50: //g' Bin_all/Bin_summary/N50.N90.txt
sed -i 's/N90: //g' Bin_all/Bin_summary/N50.N90.txt
# 排序N50和N90
R CMD BATCH --args /home/cheng/huty/softwares/script_bin/N50.N90.sort.R
三、计算GC含量、Size
Python3脚本:stat_bin_gc.py
#!/usr/bin/env python3
import os
import sys
import re
sp, indir, outfile = sys.argv
with open(outfile, 'w') as outfile:
outfile.write("Bins\tSize\tGC\n")
bins = os.listdir(indir)
for bi in bins:
bi_path = "{}/{}".format(indir, bi)
if os.path.isfile(bi_path):
with open(bi_path) as bif:
seq = bif.read()
seq = re.sub('^>.*$', '', seq)
seq = ''.join(seq.split())
# print(seq)
nG = seq.count('G')
nC = seq.count('C')
total = len(seq)
p_GC = (nC + nG) / total
bin_n = re.sub('\.fa$', '', bi)
outfile.write("{}\t{}\t{}\n".format(bin_n, total, p_GC))
批处理
# 统计Size和GC含量
stat_bin_gc.py Bin_all/Bin_pick/ Bin_all/Bin_summary/bin.gc.txt
sed '1d' Bin_all/Bin_summary/bin.gc.txt | sed '1 iBinID\tSize\tGC_percent' > Bin_all/Bin_summary/bin.gc_out.txt
四、整理结果
R脚本:N50.N90.GC.size.R
#!/usr/bin/env Rscript
setwd("Bin_all/Bin_summary")
checkm=read.table("../Bin_quality/checkm_pick.txt", header=T, sep="\t")
gc=read.table("bin.gc_out.txt", header=T, sep="\t")
n50n90=read.table("N50.N90.out.txt", header=T, sep="\t")
merge=merge(merge(checkm, n50n90, by="BinID"), gc, by="BinID")
write.table(merge, file="bin_summary.txt", row.names=F, quote=F, sep="\t")
批处理
# 合并checkm n50 n90 size gc%
R CMD BATCH --args /home/cheng/huty/softwares/script_bin/N50.N90.GC.size.R