Bracken merge samples

用了类似构造泛基因组表格的方法

1 kraken2bracken分析

kraken2数据处理

# 帮助文档
helpdoc(){
    cat <<EOF
Description:

    This is a help document
    - Plot circos

Usage:

    $0 -i <input file> -o <output file>

Option:

    -i    this is a input file/fold
    -o    this is a output file/fold
EOF
}

# 参数传递
while getopts ":i:o:" opt
do
    case $opt in
        i)
        infile=`echo $OPTARG`
        ;;
        o)
        outfile=`echo $OPTARG`
        ;;
        ?)
        echo "未知参数"
        exit 1;;
    esac
done

# 若无指定任何参数则输出帮助文档
if [ $# = 0 ]
then
    helpdoc
    exit 1
fi

# 核心代码
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate metawrap-env2
db_kraken2="/public/home/zzumgg03/huty/databases/kraken2_soil501/kraken_database"

for i in `ls split_conta/$infile/`; do
    kraken2 --db $db_kraken2 --threads 30 --report 04_bin_501/kraken_out/${i}_kraken.txt --use-names --report-zero-counts --paired split_conta/$infile/$i/${i}_1.fastq split_conta/$infile/$i/${i}_2.fastq
done

bracken数据处理

## bracken
mkdir kraken_bracken
mkdir kraken_bracken/kraken_quant
for i in `ls kraken`; do
    mv kraken/$i/* kraken_bracken/kraken_quant/
    echo -e "$i done..."
done

mkdir bracken_quant
mkdir bracken_report
nohup bash sc_kraken_bracken.sh &

sc_kraken_bracken.sh

# code
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate bracken

for i in `ls kraken_quant/`; do
    base=${i%_kraken.txt}
    bracken \
    -d /public/home/zzumgg03/huty/databases/kraken2_soil501/kraken_database \
    -i kraken_quant/$i \
    -t 10 \
    -o bracken_quant/${base}_bracken.txt \
    -w bracken_report/${base}_bracken.report
done

unclassified reads统计

for i in `ls kraken_quant/ | grep "_kraken.txt"`; do
    base=${i%_kraken.txt}
    num=`cat kraken_quant/$i | head -n 1 | awk '{print $2}'`
    echo -e "$base\t$num" >> kraken_unclassified.txt
done
nohup bash sc_extract_unclassified.sh &

用合并kraken结果表的方法已经不可以了,bracken对物种进行了选择,不再是全部物种。

total reads=unclassified reads + root/bacteria reads统计
bracken重新分配reads后会丢掉一些物种,导致bracken reads减少,total reads也减少。这里在kraken的数据中计算total reads保证其稳定不变。

for i in `ls kraken_quant/ | grep "_kraken.txt"`; do
    base=${i%_kraken.txt}
    num=`cat kraken_quant/$i | head -n 2 | awk '{SUM+=$2}END{print SUM}'`
    echo -e "$base\t$num" >> kraken_unclassified_root.txt
    echo -e "$i done..."
done

2 合并准备

## merge
mkdir bracken_merge
mkdir bracken_merge/bracken_tmp
for i in `ls bracken_quant`; do
    echo -e "$i" >> bracken_merge/sample.list
    cat bracken_quant/$i | sed "1d" | awk -F"\t" 'BEGIN{OFS="\t"}{print $1,$6}' \
    > bracken_merge/bracken_tmp/$i
    cat bracken_quant/$i | sed "1d" | awk -F"\t" 'BEGIN{OFS="\t"}{print $1}' \
    >> bracken_merge/species_raw.list
    echo -e "$i done..."
done

cd bracken_merge/
nohup cat species_raw.list | sort | uniq > species.list &

3 合并

(file) sample.list

DP8400029164BR_L01_61_bracken.txt
DP8400029164BR_L01_64_bracken.txt
DP8400029164BR_L01_68_bracken.txt
DP8400029166BR_L01_63_bracken.txt
DP8400029166BR_L01_65_bracken.txt

(file) species.list

Abiotrophia defectiva
Acanthamoeba polyphaga mimivirus
Acaryochloris marina
Acetilactobacillus jinshanensis
Acetoanaerobium sticklandii

(fold) bracken_tmp/

python脚本 sc_merge.py

#!/usr/bin/env python
import re,sys,os
import pandas as pd
import numpy as np
# 读取文件,去除换行符给新列表
# 读取需要判断PAV的gene list
with open("species.list", 'r') as list_genes:
    list_genes = list_genes.readlines()
    list_genes_enter = []
    for each in list_genes:
        list_genes_enter.append(each.strip())

# 读取每个基因组的gene list
with open("sample.list", 'r') as list_genomes:
    list_genomes = list_genomes.readlines()
    list_genomes_enter = []
    for each in list_genomes:
        list_genomes_enter.append(each.strip())

# 构造数据框
num_row = len(list_genes_enter)
num_col = len(list_genomes_enter)
num_total = num_row * num_col
df = pd.DataFrame(np.zeros(num_total).reshape((num_row, num_col)),
                  columns = list_genomes_enter,
                  index = list_genes_enter)

# 遍历所有基因集,遍历所有行名(基因)是否存在于各基因集(CGR2),重新给表格赋值
route="./bracken_tmp"
for each_genome in list_genomes_enter:
    target_file = "{}/{}".format(route, each_genome)
    # 读取基因集
    with open(target_file, 'r') as target:
        target = target.readlines()
        for each_target in target:
        # 遍历基因集,基因,数字
            each_target = each_target.strip()
            each_target_gene = re.split(r'\t', each_target)[0]
            each_target_num = re.split(r'\t', each_target)[1]
            # 判断行名基因是否在基因集,并给表格元素赋值
            # loc: 字符定位表格元素
            # iloc: 数字定位
            df.loc[each_target_gene, each_genome] = each_target_num
    print("\033[32m {} DONE!\033[0m".format(each_genome))

# 表格保存
df.to_csv('table_merge.txt', sep='\t', index = True)

运行

# zhengzhou super computer
source /public/home/zzumgg03/huty/softwares/miniconda3/etc/profile.d/conda.sh
conda activate py37
nohup python3 sc_merge.py &

4 分类注释

# taxonomy
conda activate py37

python3 ../script/kraken_anno_k.py \
kraken_quant/sample_1/DP8400029164BR_L01_61_kraken.report \
kraken_anno.txt

cat kraken_anno.txt | awk -F"\t" '{if($4=="S") print $6}' \
> kraken_anno_s.txt

kraken_anno_k.py 脚本

#!/usr/bin/env python3
import os, sys, re
ms, infile_name, outfile_name = sys.argv

with open(infile_name, 'r') as infile:
    with open(outfile_name, 'w') as o:
        Domain = ""
        Family = ""
        for line in infile:
            line = line.strip()
            lines = re.split(r'\t', line)
            lines[5] = re.split(r'^[ ]*', lines[5])[1]
            if lines[3] == "D":
                Domain = lines[5]
                o.write("\t".join(lines))
                o.write("\n")
                Kingdom = "NG"  ## Kingdom非必须,Domain必须,在这里赋空值
            elif lines[3] == "K":
                Kingdom = lines[5]
                lines[5] = Domain+";"+Kingdom
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "P":
                Phylum = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "C":
                Class = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "O":
                Order = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "F":
                Family = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "G":
                Genus = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family+";"+Genus
                o.write("\t".join(lines))
                o.write("\n")
            elif lines[3] == "S":
                Species = lines[5]
                lines[5] = Domain+";"+Kingdom+";"+Phylum+";"+Class+";"+Order+";"+Family+";"+Genus+";"+Species
                o.write("\t".join(lines))
                o.write("\n")

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 216,744评论 6 502
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 92,505评论 3 392
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 163,105评论 0 353
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,242评论 1 292
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,269评论 6 389
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,215评论 1 299
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,096评论 3 418
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,939评论 0 274
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,354评论 1 311
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,573评论 2 333
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,745评论 1 348
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,448评论 5 344
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 41,048评论 3 327
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,683评论 0 22
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,838评论 1 269
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,776评论 2 369
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,652评论 2 354

推荐阅读更多精彩内容