宏基因组分析（一）kneaddata质控及代码分享

病情说明：

虽然我是单细胞师兄，但是也做过宏基因组、代谢组和单细胞转录组的联合分析。文末会提供kneaddata的代码，联系我可以获得docker镜像和分析数据哦！！在此，我先说明一下如果使用你们自己的数据，我们先要把数据整理成这种格式：

那就是需要把fastq文件准备成这个样子（序列id相同，/1 /2用来区分双端测序数据），这个一定要注意！！！不然kneaddata可以正常运行，但是clean data为空。
文件名为sampleid_R1.fastq.gz和sampleid_R2.fastq.gz

image.png

image.png

治疗方案

import time
import threading
import argparse
import os
from pickle import FALSE
import threading
import shutil
import datetime
import json
import time
import subprocess
import pandas as pd

method_path = 'plot_scripts'
refer_path = 'DB'

# 1: check function
def knead_check(data_path: str = None,                ## String: path to offline data
                refer_path: str = None,               ## String: path to database
                species: str = None,                  ## String: species: {human} and {mouse}
                out_path: str = None                  ## String: output path for knead procedure
):
    ## check sampleinfo file
    sampleinfo = f'{data_path}/sample.txt'
    if not os.path.exists(sampleinfo):
        raise SystemExit(' No sampleinfo file ! Please check datasets !')
    else:
        pass

    ## choose species
    kneaddata_database = refer_path + '/kneaddb'
    if species == 'human':
        kneaddb = kneaddata_database + '/human_hg19/hg19'
    elif species == 'mouse':
        kneaddb = kneaddata_database + '/mouse_C57BL/mouse_C57BL_6NJ'
    else:
        pass

    if not os.path.exists(out_path):
        os.makedirs(out_path, exist_ok=True)
        os.system(f'cp {sampleinfo} {out_path}')
    else:
        pass
    ## output
    check_file_path = f'{out_path}/knead_check_file.txt'
    check_file = pd.DataFrame([
        data_path,
        species,
        kneaddb,
        sampleinfo
    ])
    
    check_file.to_csv(check_file_path,
                      header=None, index=None, sep='\t', mode='w')
    return 0
    
class FunctionStatusChecker:
    def __init__(self, func: callable):
        self.func = func
        self.start_time = None
        self.end_time = None
        self.lock = threading.Lock()
    def __call__(self, *args, **kwargs):
        self.start_time = time.time()
        result = self.func(*args, **kwargs)
        self.end_time = time.time()
        return result
    def is_executed(self) -> bool:
        with self.lock:
            return self.start_time is not None and self.end_time is not None

@FunctionStatusChecker
def execCmd(cmd):
    try:
        print("命令{}开始运行{}".format(cmd, datetime.datetime.now()))
        time.sleep(5)
        os.system(cmd)
        print("命令{}结束运行{}".format(cmd, datetime.datetime.now()))
    except:
        print("{}\t运行失败".format(cmd))      
        
@FunctionStatusChecker
def RunKnead(out_path:str = None                  ## String: output path for knead procedure)
              ):
    ## set parameters
    knead_check = pd.read_csv(f'{out_path}/knead_check_file.txt', header=None,delimiter='\t')
    data_path = knead_check.at[0,0]
    species = knead_check.at[1,0]
    kneaddb = knead_check.at[2,0]
    sampleinfo = knead_check.at[3,0]
    
    sample_info_dic = {}
    cmds = []
    for line in open(sampleinfo):
        if not line.startswith("sample"):
            line=line.strip().split()
            sample_info_dic[line[0]]=line[1]
            cmd= f'''/home/soft/miniconda3/envs/Rdoc/bin/kneaddata -t 6 -v \
                -i1 {data_path}/{line[0]}_R1.fastq.gz \
                -i2 {data_path}/{line[0]}_R2.fastq.gz \
                -o  {out_path}/{line[0]} -db {kneaddb} \
                --cat-final-output --output-prefix {line[0]}  --serial \
                --trimmomatic /home/soft/Trimmomatic-0.39/ \
                --trimmomatic-options 'SLIDINGWINDOW:4:20 MINLEN:50' \
                --bowtie2 /home/soft/miniconda3/envs/Rdoc/bin/ \
                --bowtie2-options '--very-sensitive --dovetail' \
                --remove-intermediate-output \
                --fastqc /home/soft/miniconda3/envs/Rdoc/bin/ \
                --trf /home/soft/miniconda3/envs/Rdoc/bin/ \
                --run-fastqc-start \
                --run-fastqc-end '''
        cmds.append(cmd)
        threads = []
    for cmd in cmds:
        th = threading.Thread(target=execCmd, args=(cmd,))
        th.start()
        th.join()
        time.sleep(2)
        threads.append(th)
    return out_path
    
@FunctionStatusChecker
def Run_Stats(out_path:str = None                  ## String: output path for knead procedure
             ):
    StatsDir = out_path + '/readsinfo'
    if os.path.exists(StatsDir):
        pass
    else:
        os.mkdir(StatsDir)
    x= [item for item in os.walk(out_path)]
    for path, dirs, files in x:
        for file in files:
            if file.endswith('.log'):
                shutil.copy(path+os.sep+file,StatsDir)
    cmd = f"cd {out_path} && kneaddata_read_count_table --input  readsinfo/ --output kneaddata_read_count_table.tsv && \
    cut -f 1-5,12-13 kneaddata_read_count_table.tsv | sed 's/_R1_kneaddata//;s/pair//g' > kneaddata_report.txt "
    execCmd(cmd)
    
def knead_call(out_path:str = None                  ## String: output path for knead procedure
        ):
    function_checker = RunKnead
    result = function_checker(out_path) 
    cleandata_path = os.path.abspath(out_path)
    sampleinfo = os.path.abspath(out_path)+'/sample.txt'
    ## output
    call_file_path = f'{out_path}/knead_call_file.txt'
    call_file = pd.DataFrame([cleandata_path,sampleinfo])

    call_file.to_csv(call_file_path,
                      header=None, index=None, sep='\t', mode='w')
    
    step1 = function_checker.is_executed()
    if step1 is True:
        function_checker2 = Run_Stats
        result = function_checker2(out_path) 
        #print(result)  
        step2 = function_checker.is_executed()
    else:
        wait()
    return 0

def knead_plot(out_path:str = None                  ## String: output path for knead procedure
              ):
    cmd = f" Rscript {method_path}/knead_plot.R -w {out_path} "
    execCmd(cmd)
    return 0
    
data_path = './RawData/dre'
species= 'mouse'     
out_path ='kneaddout'
knead_check(data_path,refer_path,species,out_path)
knead_call('kneaddout')
knead_plot('kneaddout')

3. 补充内容

如果需要的话，我可以提供打包好的宏基因组分析docker镜像和测试数据，另外持续关注哦，因为我会持续更新！！！

宏基因组分析（一）kneaddata质控及代码分享

病情说明：

治疗方案

3. 补充内容

推荐阅读更多精彩内容