话不多说,直接上代码
import sys
from Bio import SeqIO
import gzip
fq1,fq2 = sys.argv[1:] #输入文件
### 判断输入文件为fastq还是fastq.gz
def trans2fastq(fin) :
fout = fin.replace('.gz','')
g = gzip.GzipFile(mode='rb', fileobj=open(fin,'rb'))
open(fout,'wb').write(g.read())
return(fout)
### 将fastq.gz 转为fastq
if '.gz' in fq1 : fq1 = trans2fastq(fq1)
if '.gz' in fq2 : fq2 = trans2fastq(fq2)
#### 计算
for f in [fq1, fq2] :
reads_count = 0
gc_base = 0
all_base = 0
all_qual = 0
q20_qual = 0
q30_qual = 0
record = SeqIO.parse(f,'fastq')
for lane in record :
## 序列数
reads_count+=1
### 碱基统计
gc_base+=len([i for i in lane.seq if i=='G' or i=='C'])
all_base+=(len(lane.seq))
### 质量值统计
qual = lane.letter_annotations['phred_quality']
all_qual+=(len(qual))
q20_qual+=len([q for q in qual if q >=20])
q30_qual+=len([q for q in qual if q >=30])
### 计算比例
q20 = round(q20_qual/all_qual,4) * 100
q30 = round(q30_qual/all_qual,4) * 100
gc = round(gc_base/all_base,4) * 100
print(reads_count,all_base,gc,q20,q30)