病情说明:
虽然我是单细胞师兄,但是也做过宏基因组、代谢组和单细胞转录组的联合分析。文末会提供kneaddata的代码,联系我可以获得docker镜像和分析数据哦!!在此,我先说明一下如果使用你们自己的数据,我们先要把数据整理成这种格式:
- 那就是需要把fastq文件准备成这个样子(序列id相同,/1 /2用来区分双端测序数据),这个一定要注意!!!不然kneaddata可以正常运行,但是clean data为空。
-
文件名为sampleid_R1.fastq.gz和sampleid_R2.fastq.gz
治疗方案
import time
import threading
import argparse
import os
from pickle import FALSE
import threading
import shutil
import datetime
import json
import time
import subprocess
import pandas as pd
method_path = 'plot_scripts'
refer_path = 'DB'
# 1: check function
def knead_check(data_path: str = None, ## String: path to offline data
refer_path: str = None, ## String: path to database
species: str = None, ## String: species: {human} and {mouse}
out_path: str = None ## String: output path for knead procedure
):
## check sampleinfo file
sampleinfo = f'{data_path}/sample.txt'
if not os.path.exists(sampleinfo):
raise SystemExit(' No sampleinfo file ! Please check datasets !')
else:
pass
## choose species
kneaddata_database = refer_path + '/kneaddb'
if species == 'human':
kneaddb = kneaddata_database + '/human_hg19/hg19'
elif species == 'mouse':
kneaddb = kneaddata_database + '/mouse_C57BL/mouse_C57BL_6NJ'
else:
pass
if not os.path.exists(out_path):
os.makedirs(out_path, exist_ok=True)
os.system(f'cp {sampleinfo} {out_path}')
else:
pass
## output
check_file_path = f'{out_path}/knead_check_file.txt'
check_file = pd.DataFrame([
data_path,
species,
kneaddb,
sampleinfo
])
check_file.to_csv(check_file_path,
header=None, index=None, sep='\t', mode='w')
return 0
class FunctionStatusChecker:
def __init__(self, func: callable):
self.func = func
self.start_time = None
self.end_time = None
self.lock = threading.Lock()
def __call__(self, *args, **kwargs):
self.start_time = time.time()
result = self.func(*args, **kwargs)
self.end_time = time.time()
return result
def is_executed(self) -> bool:
with self.lock:
return self.start_time is not None and self.end_time is not None
@FunctionStatusChecker
def execCmd(cmd):
try:
print("命令{}开始运行{}".format(cmd, datetime.datetime.now()))
time.sleep(5)
os.system(cmd)
print("命令{}结束运行{}".format(cmd, datetime.datetime.now()))
except:
print("{}\t运行失败".format(cmd))
@FunctionStatusChecker
def RunKnead(out_path:str = None ## String: output path for knead procedure)
):
## set parameters
knead_check = pd.read_csv(f'{out_path}/knead_check_file.txt', header=None,delimiter='\t')
data_path = knead_check.at[0,0]
species = knead_check.at[1,0]
kneaddb = knead_check.at[2,0]
sampleinfo = knead_check.at[3,0]
sample_info_dic = {}
cmds = []
for line in open(sampleinfo):
if not line.startswith("sample"):
line=line.strip().split()
sample_info_dic[line[0]]=line[1]
cmd= f'''/home/soft/miniconda3/envs/Rdoc/bin/kneaddata -t 6 -v \
-i1 {data_path}/{line[0]}_R1.fastq.gz \
-i2 {data_path}/{line[0]}_R2.fastq.gz \
-o {out_path}/{line[0]} -db {kneaddb} \
--cat-final-output --output-prefix {line[0]} --serial \
--trimmomatic /home/soft/Trimmomatic-0.39/ \
--trimmomatic-options 'SLIDINGWINDOW:4:20 MINLEN:50' \
--bowtie2 /home/soft/miniconda3/envs/Rdoc/bin/ \
--bowtie2-options '--very-sensitive --dovetail' \
--remove-intermediate-output \
--fastqc /home/soft/miniconda3/envs/Rdoc/bin/ \
--trf /home/soft/miniconda3/envs/Rdoc/bin/ \
--run-fastqc-start \
--run-fastqc-end '''
cmds.append(cmd)
threads = []
for cmd in cmds:
th = threading.Thread(target=execCmd, args=(cmd,))
th.start()
th.join()
time.sleep(2)
threads.append(th)
return out_path
@FunctionStatusChecker
def Run_Stats(out_path:str = None ## String: output path for knead procedure
):
StatsDir = out_path + '/readsinfo'
if os.path.exists(StatsDir):
pass
else:
os.mkdir(StatsDir)
x= [item for item in os.walk(out_path)]
for path, dirs, files in x:
for file in files:
if file.endswith('.log'):
shutil.copy(path+os.sep+file,StatsDir)
cmd = f"cd {out_path} && kneaddata_read_count_table --input readsinfo/ --output kneaddata_read_count_table.tsv && \
cut -f 1-5,12-13 kneaddata_read_count_table.tsv | sed 's/_R1_kneaddata//;s/pair//g' > kneaddata_report.txt "
execCmd(cmd)
def knead_call(out_path:str = None ## String: output path for knead procedure
):
function_checker = RunKnead
result = function_checker(out_path)
cleandata_path = os.path.abspath(out_path)
sampleinfo = os.path.abspath(out_path)+'/sample.txt'
## output
call_file_path = f'{out_path}/knead_call_file.txt'
call_file = pd.DataFrame([cleandata_path,sampleinfo])
call_file.to_csv(call_file_path,
header=None, index=None, sep='\t', mode='w')
step1 = function_checker.is_executed()
if step1 is True:
function_checker2 = Run_Stats
result = function_checker2(out_path)
#print(result)
step2 = function_checker.is_executed()
else:
wait()
return 0
def knead_plot(out_path:str = None ## String: output path for knead procedure
):
cmd = f" Rscript {method_path}/knead_plot.R -w {out_path} "
execCmd(cmd)
return 0
data_path = './RawData/dre'
species= 'mouse'
out_path ='kneaddout'
knead_check(data_path,refer_path,species,out_path)
knead_call('kneaddout')
knead_plot('kneaddout')
3. 补充内容
如果需要的话,我可以提供打包好的宏基因组分析docker镜像和测试数据,另外持续关注哦,因为我会持续更新 !!!