目的:
- 检查下机fastq.gz文件的完整性
- 可以只生成MD5文件,但不做检查
- 可以多线程批量解压fastq.gz
- 可以多线程批量压缩fastq
# -*- coding:utf-8 -*-
# Used for MD5 checking and for compressing/decompressing files
# Author:Robin; Created in 20200316
import getopt
import glob
import os
import re
import shlex
import sys
import time
#import filecmp
def parameter_check():
    """Parse the command line and run the requested actions.

    Value options (-m/--md5_file, -f/--file_type, -n/--new_md5_file,
    -t/--threads) are collected first, so they may appear anywhere on the
    command line relative to the action flags.  The action flags
    (-c/--check, -s/--sum, -d/--decompress, -p/--press) are validated and
    then executed in the order they were given.

    Returns:
        tuple: ``(raw_md5, file_type)``; an element is ``None`` when the
        corresponding option was not supplied.

    Exits through ``sys.exit`` after printing the usage text when no option
    is given or help is requested (status 0), or when option parsing fails,
    the thread count is invalid, or an action lacks a required value
    (status 2).
    """
    try:
        opts, _ = getopt.getopt(
            sys.argv[1:],
            'hm:f:t:n:cdps',
            # 'threads=' needs the trailing '=' so that --threads takes a value.
            ['help', 'md5_file=', 'file_type=', 'threads=',
             'new_md5_file=', 'check', 'decompress', 'press', 'sum'])
    except getopt.GetoptError as err:
        # Prints something like "option -a not recognized".
        print(str(err))
        usage()
        sys.exit(2)
    if not opts:
        usage()
        sys.exit()

    raw_md5 = file_type = new_md5_file = None
    # Without -t, fall back to the number of CPUs reported by the OS.
    cores = os.cpu_count() or 1
    actions = []

    # First pass: gather values and remember which actions were requested.
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-m', '--md5_file'):
            raw_md5 = arg
            print('raw_md5:{}'.format(raw_md5))
        elif opt in ('-f', '--file_type'):
            file_type = arg
            print('file_type:{}'.format(file_type))
        elif opt in ('-n', '--new_md5_file'):
            new_md5_file = arg
            print('new_md5_file:{}'.format(new_md5_file))
        elif opt in ('-t', '--threads'):
            try:
                cores = int(arg)
            except ValueError:
                cores = 0
            if cores < 1:
                print('option {} expects a positive integer, got {!r}'.format(opt, arg))
                usage()
                sys.exit(2)
        elif opt in ('-c', '--check'):
            actions.append('check')
        elif opt in ('-s', '--sum'):
            actions.append('sum')
        elif opt in ('-d', '--decompress'):
            actions.append('decompress')
        elif opt in ('-p', '--press'):
            actions.append('press')

    # Validate every requested action before running any of them, so a
    # missing option is reported up front instead of crashing half-way.
    required = {
        'check': (('-m', raw_md5), ('-f', file_type), ('-n', new_md5_file)),
        'sum': (('-f', file_type), ('-n', new_md5_file)),
        'decompress': (('-f', file_type),),
        'press': (('-f', file_type),),
    }
    for action in actions:
        missing = [flag for flag, value in required[action] if value is None]
        if missing:
            print('--{} requires option(s): {}'.format(action, ', '.join(missing)))
            usage()
            sys.exit(2)

    # Second pass: run the actions in command-line order.
    for action in actions:
        if action == 'check':
            print('Running M5Dcheck function')
            M5Dcheck(raw_md5, file_type, new_md5_file)
        elif action == 'sum':
            MD5sum(file_type, new_md5_file)
        elif action == 'decompress':
            decompress(cores, file_type)
        elif action == 'press':
            compress(cores, file_type)

    return (raw_md5, file_type)
def MD5sum(file_type, new_md5_file):
    """Write the MD5 checksums of all matching files to ``new_md5_file``.

    Runs ``find *<file_type> | xargs md5sum`` through the shell in the
    current working directory and prints whether the command succeeded.

    Args:
        file_type: Shell glob suffix appended to ``*`` (for example
            ``*.fq.gz``); deliberately left unquoted so the shell expands it.
        new_md5_file: Path of the checksum file to (re)create.
    """
    print('正在运行MD5SUM...')
    # Overwrite the requested output file with '>' so that repeated runs do
    # not accumulate stale or duplicate lines.  The output path is quoted
    # because it is a literal file name, not a pattern.
    cmd = 'find *{} -print0 | xargs -0 md5sum > {}'.format(
        file_type, shlex.quote(new_md5_file))
    stat = os.system(cmd)
    if stat:
        print('生成MD5文件有问题,请检查!')
    else:
        print('MD5文件已生成:./{}'.format(new_md5_file))
def _load_md5_table(path):
    """Read an md5sum-style file and return ``{file name: lowercase digest}``."""
    table = dict()
    with open(path, 'r') as handle:
        for line in handle:
            # Split on the first run of whitespace only, so file names that
            # contain spaces survive; blank or malformed lines are skipped.
            fields = line.strip().split(None, 1)
            if len(fields) != 2:
                continue
            md5, name = fields
            # md5sum marks binary mode as "<digest> *<name>".
            if name.startswith('*'):
                name = name[1:]
            # Compare by file name only, whatever directory prefix was recorded.
            table[os.path.basename(name)] = md5.lower()
    return table


def M5Dcheck(raw_md5, file_type, new_md5_file):
    """Regenerate checksums and compare them with the reference MD5 file.

    Calls ``MD5sum(file_type, new_md5_file)`` first, then reports, for every
    file listed in ``raw_md5``, whether the freshly computed digest matches,
    followed by a summary line.  A file that is listed in ``raw_md5`` but
    absent from the new checksum file counts as a failure.

    Args:
        raw_md5: Path of the reference md5sum-style file.
        file_type: Glob suffix forwarded to ``MD5sum``.
        new_md5_file: Path of the checksum file that ``MD5sum`` generates.
    """
    MD5sum(file_type, new_md5_file)
    raw_dict = _load_md5_table(raw_md5)
    new_dict = _load_md5_table(new_md5_file)
    failed = passed = 0
    for check, md5 in raw_dict.items():
        if new_dict.get(check) != md5:
            print("文件{}的MD5值校验失败!".format(check))
            failed += 1
        else:
            print('恭喜,文件{}校验成功!\n'.format(check))
            passed += 1
    print('总文件数:{}个\t验证成功:{}个\t验证失败:{}个'.format(failed + passed, passed, failed))
def usage():
    """Print the command-line help text, one example per supported mode.

    The decompress and compress examples pass the thread count with ``-t``;
    ``-c`` is the MD5-check flag and takes no value.
    """
    print('''
Usage:
MD5文件校验:python ParallelMd5DeCompress.py -m <'文件的原本MD5文件'> -f <'*.文件后缀名'> -n <'新生成MD5的文件名'> -c
只生成MD5检验文件:python ParallelMd5DeCompress.py -f <'*.文件后缀名'> -n <'新生成MD5的文件名'> -s
文件解压:python ParallelMd5DeCompress.py -t <'线程数'> -f <'*.文件后缀名'> -d
文件压缩:python ParallelMd5DeCompress.py -t <'线程数'> -f <'*.文件后缀名'> -p
''')
def decompress(cores, file_type):
    """Decompress every file matching ``file_type`` with ``pigz -dk``.

    The original archives are kept (``-k``).  For each file the name, the
    status returned by ``os.system`` and a success/failure message are
    printed.

    Args:
        cores: Number of threads handed to ``pigz -p``.
        file_type: Glob pattern of the files to decompress, e.g. ``*.fq.gz``.
    """
    # glob yields clean names (no trailing newline, no pipe left open);
    # sorting keeps the alphabetical order that ``ls`` used to provide.
    for fq in sorted(glob.glob(file_type)):
        print(fq)
        # Quote the arguments so names with spaces or shell metacharacters
        # reach pigz intact.
        status = os.system('pigz -dk -p {} {}'.format(
            shlex.quote(str(cores)), shlex.quote(fq)))
        print(status)
        if status:
            print('{}文件解压失败,请检查!'.format(fq))
        else:
            print('{}文件解压成功!'.format(fq))
def compress(cores, file_type):
    """Compress every file matching ``file_type`` with ``pigz -k``.

    The original files are kept (``-k``).  For each file the name, the
    status returned by ``os.system`` and a success/failure message are
    printed.

    Args:
        cores: Number of threads handed to ``pigz -p``.
        file_type: Glob pattern of the files to compress, e.g. ``*.fq``.
    """
    # glob yields clean names (no trailing newline, no pipe left open);
    # sorting keeps the alphabetical order that ``ls`` used to provide.
    for fq in sorted(glob.glob(file_type)):
        print(fq)
        # Quote the arguments so names with spaces or shell metacharacters
        # reach pigz intact.
        status = os.system('pigz -k -p {} {}'.format(
            shlex.quote(str(cores)), shlex.quote(fq)))
        print(status)
        # A non-zero status means pigz failed.
        if status:
            print('{}文件压缩失败,请检查!'.format(fq))
        else:
            print('{}文件压缩成功!'.format(fq))
if __name__ == "__main__":
    # Script entry point: run the command-line handler and report the
    # wall-clock time it took.
    t0 = time.time()
    parameter_check()
    elapsed = time.time() - t0
    print("用时: {}s".format(elapsed))
运行结果展示:
1、测试:
$ls
CheckMD5.txt compress_MD5check.py ParallelMd5DeCompress.py raw.md5 test1.fq.gz test2.fq.gz
$python ParallelMd5DeCompress.py
Usage:
MD5文件校验:python ParallelMd5DeCompress.py -m <'文件的原本MD5文件'> -f <'*.文件后缀名'> -n <'新生成MD5的文件名'> -c
只生成MD5检验文件:python ParallelMd5DeCompress.py -f <'*.文件后缀名'> -n <'新生成MD5的文件名'> -s
文件解压:python ParallelMd5DeCompress.py -c -f <'*.文件后缀名'> -d
文件压缩:python ParallelMd5DeCompress.py -c -f <'*.文件后缀名'> -p
$python ParallelMd5DeCompress.py -m 'raw.md5' -f '*.fq.gz' -n 'new2.md5' -c
raw_md5:raw.md5
file_type:*.fq.gz
new_md5_file:new2.md5
Running M5Dcheck function
正在运行MD5SUM...
MD5文件已生成:./new2.md5
恭喜,文件test1.fq.gz校验成功!
恭喜,文件test2.fq.gz校验成功!
总文件数:2个 验证成功:2个 验证失败:0个
用时: 0.06068730354309082s
2、实际运行:
$jobs
[1]+ 运行中 python /share/nas1/Data/Users/luohb/Pipline/MD5check/test_dir/ParallelMd5DeCompress.py -m 'MD5.txt' -f '*.fq.gz' -n 'review.md5' -c | tee log &