病情说明:
虽然我是单细胞师兄,但是也做过宏基因组、代谢组和单细胞转录组的联合分析。文末会提供kneaddata的代码,联系我可以获得docker镜像和分析数据哦!!在此,我先说明一下如果使用你们自己的数据,我们先要把数据整理成这种格式:
- 那就是需要把fastq文件准备成这个样子(序列id相同,/1 /2用来区分双端测序数据),这个一定要注意!!!不然kneaddata可以正常运行,但是clean data为空。
-
文件名为sampleid_R1.fastq.gz和sampleid_R2.fastq.gz
image.png
image.png
治疗方案
import time
import threading
import argparse
import os
from pickle import FALSE
import threading
import shutil
import datetime
import json
import time
import subprocess
import pandas as pd
method_path = 'plot_scripts'
refer_path = 'DB'
# 1: check function
def knead_check(data_path: str = None, ## String: path to offline data
refer_path: str = None, ## String: path to database
species: str = None, ## String: species: {human} and {mouse}
out_path: str = None ## String: output path for knead procedure
):
## check sampleinfo file
sampleinfo = f'{data_path}/sample.txt'
if not os.path.exists(sampleinfo):
raise SystemExit(' No sampleinfo file ! Please check datasets !')
else:
pass
## choose species
kneaddata_database = refer_path + '/kneaddb'
if species == 'human':
kneaddb = kneaddata_database + '/human_hg19/hg19'
elif species == 'mouse':
kneaddb = kneaddata_database + '/mouse_C57BL/mouse_C57BL_6NJ'
else:
pass
if not os.path.exists(out_path):
os.makedirs(out_path, exist_ok=True)
os.system(f'cp {sampleinfo} {out_path}')
else:
pass
## output
check_file_path = f'{out_path}/knead_check_file.txt'
check_file = pd.DataFrame([
data_path,
species,
kneaddb,
sampleinfo
])
check_file.to_csv(check_file_path,
header=None, index=None, sep='\t', mode='w')
return 0
class FunctionStatusChecker:
def __init__(self, func: callable):
self.func = func
self.start_time = None
self.end_time = None
self.lock = threading.Lock()
def __call__(self, *args, **kwargs):
self.start_time = time.time()
result = self.func(*args, **kwargs)
self.end_time = time.time()
return result
def is_executed(self) -> bool:
with self.lock:
return self.start_time is not None and self.end_time is not None
@FunctionStatusChecker
def execCmd(cmd):
try:
print("命令{}开始运行{}".format(cmd, datetime.datetime.now()))
time.sleep(5)
os.system(cmd)
print("命令{}结束运行{}".format(cmd, datetime.datetime.now()))
except:
print("{}\t运行失败".format(cmd))
@FunctionStatusChecker
def RunKnead(out_path:str = None ## String: output path for knead procedure)
):
## set parameters
knead_check = pd.read_csv(f'{out_path}/knead_check_file.txt', header=None,delimiter='\t')
data_path = knead_check.at[0,0]
species = knead_check.at[1,0]
kneaddb = knead_check.at[2,0]
sampleinfo = knead_check.at[3,0]
sample_info_dic = {}
cmds = []
for line in open(sampleinfo):
if not line.startswith("sample"):
line=line.strip().split()
sample_info_dic[line[0]]=line[1]
cmd= f'''/home/soft/miniconda3/envs/Rdoc/bin/kneaddata -t 6 -v \
-i1 {data_path}/{line[0]}_R1.fastq.gz \
-i2 {data_path}/{line[0]}_R2.fastq.gz \
-o {out_path}/{line[0]} -db {kneaddb} \
--cat-final-output --output-prefix {line[0]} --serial \
--trimmomatic /home/soft/Trimmomatic-0.39/ \
--trimmomatic-options 'SLIDINGWINDOW:4:20 MINLEN:50' \
--bowtie2 /home/soft/miniconda3/envs/Rdoc/bin/ \
--bowtie2-options '--very-sensitive --dovetail' \
--remove-intermediate-output \
--fastqc /home/soft/miniconda3/envs/Rdoc/bin/ \
--trf /home/soft/miniconda3/envs/Rdoc/bin/ \
--run-fastqc-start \
--run-fastqc-end '''
cmds.append(cmd)
threads = []
for cmd in cmds:
th = threading.Thread(target=execCmd, args=(cmd,))
th.start()
th.join()
time.sleep(2)
threads.append(th)
return out_path
@FunctionStatusChecker
def Run_Stats(out_path:str = None ## String: output path for knead procedure
):
StatsDir = out_path + '/readsinfo'
if os.path.exists(StatsDir):
pass
else:
os.mkdir(StatsDir)
x= [item for item in os.walk(out_path)]
for path, dirs, files in x:
for file in files:
if file.endswith('.log'):
shutil.copy(path+os.sep+file,StatsDir)
cmd = f"cd {out_path} && kneaddata_read_count_table --input readsinfo/ --output kneaddata_read_count_table.tsv && \
cut -f 1-5,12-13 kneaddata_read_count_table.tsv | sed 's/_R1_kneaddata//;s/pair//g' > kneaddata_report.txt "
execCmd(cmd)
def knead_call(out_path:str = None ## String: output path for knead procedure
):
function_checker = RunKnead
result = function_checker(out_path)
cleandata_path = os.path.abspath(out_path)
sampleinfo = os.path.abspath(out_path)+'/sample.txt'
## output
call_file_path = f'{out_path}/knead_call_file.txt'
call_file = pd.DataFrame([cleandata_path,sampleinfo])
call_file.to_csv(call_file_path,
header=None, index=None, sep='\t', mode='w')
step1 = function_checker.is_executed()
if step1 is True:
function_checker2 = Run_Stats
result = function_checker2(out_path)
#print(result)
step2 = function_checker.is_executed()
else:
wait()
return 0
def knead_plot(out_path:str = None ## String: output path for knead procedure
):
cmd = f" Rscript {method_path}/knead_plot.R -w {out_path} "
execCmd(cmd)
return 0
data_path = './RawData/dre'
species= 'mouse'
out_path ='kneaddout'
knead_check(data_path,refer_path,species,out_path)
knead_call('kneaddout')
knead_plot('kneaddout')
3. 补充内容
如果需要的话,我可以提供打包好的宏基因组分析docker镜像和测试数据,另外持续关注哦,因为我会持续更新 !!!