水平基因转移基因鉴定1

Genome screening for candidate horizontal gene transfer (HGT) acquired genes

All protein sequences from B. corticulans' 13 chromosomes were aligned with the NRDB using Diamond (v.2.1.8.162) (Buchfink et al., 2015). The following parameters were applied: E−value = 1e−5, identity > 20%, and max_target_seqs = 1000. We then obtained the taxonomic classification of the hit genes using custom Python scripts based on the NCBI Taxonomy Database. The hits were classified into seven categories according to NCBI taxonomy: bacteria, fungi, archaea, viruses, metazoans, green plants, and other eukaryotic species. Putative donor groups included bacteria (excluding cyanobacteria), fungi, archaea, viruses, and metazoans.

For each B. corticulans gene to be considered a candidate, it had to meet the following conditions:

The top 5 hit species must be from a putative donor group (excluding hits from Bryopsis).
Among the top 50 hits, at least 30 different species must belong to a putative donor group.

The top 50 hit species should overwhelmingly belong to putative donor groups. An index, calculated using the equation below, was used to evaluate the distribution of species within the top 50 hits:

image.png

where n is the number of donor groups, Exp i is the number of species in each group, and Exp max is the maximum number of species among all groups. An index value > 0.8 indicates that most of the top 50 hits belong to a given donor group.

复刻如下，两个脚本分别计算物种蛋白是否来自于hgt和index值。这次只把"Bacteria", "Fungi", "Archaea"三个大类群作为hgt基因的供体，index值约接近1，代表其来自于供体的可能性越大。在本脚本中做了修改，即使前五个不一致，但大多数来自于同一大类群，也认为属于hgt基因。

import os
import pandas as pd
import subprocess

####################################################################
#比对格式如下，需要进行nt-blast和自身blast，输出格式是特殊的-6格式，产出共计14列
#blastn -db /data/share/OriginTools/tools/MetaDB/NCBI_NT_DB/nt    \
#    -query 240408.cds   \
#    -outfmt "6 qseqid staxids bitscore std"    \
#    -max_target_seqs 1000        -evalue 1e-5    \
#    -num_threads 63        -out scap.blastn
#
####################################################################
# 参数设置
blastn_file = "scap.blastn"
taxdump_dir = "./taxdump_db"
output_file = "hgt_candidates_from_nt.csv"
self_taxid = 0  # 你的物种taxid，由于本物种比对未加入taxid，比对taxid都是0
min_donor_class_count = 2

donor_classes = {"Bacteria", "Fungi", "Archaea"}

# 读取blast文件，14列，字段顺序如下
cols = [
    "qseqid_1",   # 1st qseqid，虽然重复，但先读入
    "staxids",    # taxid字段（2nd字段）
    "bitscore",   # bitscore
    "qseqid_2",   # 4th qseqid（重复）
    "sseqid",
    "pident",
    "length",
    "mismatch",
    "gapopen",
    "qstart",
    "qend",
    "sstart",
    "send",
    "evalue",
]

def batch_get_lineages(taxids):
    """
    批量调用taxonkit lineage，返回dict {taxid: [lineage_rank_list]}
    """
    taxid_str = "\n".join(map(str, taxids))
    try:
        result = subprocess.run(
            ["taxonkit", "lineage", "--data-dir", taxdump_dir],
            input=taxid_str,
            capture_output=True,
            text=True,
            check=True,
        )
        lineage_dict = {}
        for line in result.stdout.strip().split("\n"):
            parts = line.split("\t")
            taxid = int(parts[0])
            lineage_str = parts[-1]
            lineage_list = lineage_str.split(";") if lineage_str else []
            lineage_dict[taxid] = lineage_list
        return lineage_dict
    except subprocess.CalledProcessError as e:
        print("Error calling taxonkit lineage:", e)
        return {}

def classify_taxid(taxid, lineage_dict):
    lineage = lineage_dict.get(taxid, [])
    for rank in donor_classes:
        if rank in lineage:
            return rank
    return "Other"

def screen_hgt():
    # 读取blast结果
    blast = pd.read_csv(blastn_file, sep="\t", header=None, names=cols)
    # staxids列转int（有时可能是逗号分隔多个taxid，这里取第一个）
    def parse_taxid(stax):
        if pd.isna(stax):
            return -1
        return int(str(stax).split(";")[0])
    blast["staxids"] = blast["staxids"].apply(parse_taxid)
    blast = blast[blast["staxids"] != -1]

    # 按bitscore降序排序，groupby qseqid_1
    blast = blast.sort_values("bitscore", ascending=False)
    grouped = blast.groupby("qseqid_1")

    # 取所有top5的taxid，去重，批量获取lineage
    all_top5_taxids = set()
    top5_per_query = {}
    for query, group in grouped:
        top5 = group.head(5)
        taxids = top5["staxids"].tolist()
        top5_per_query[query] = taxids
        all_top5_taxids.update(taxids)

    # 批量获取lineage
    lineage_dict = batch_get_lineages(all_top5_taxids)

    hgt_candidates = []
    filtered_self = 0
    filtered_non_donor = 0
    filtered_few_donor = 0
    total = 0

    for query, taxids in top5_per_query.items():
        total += 1
        if self_taxid in taxids:
            filtered_self += 1
            continue

        classes = [classify_taxid(tid, lineage_dict) for tid in taxids]
        donor_taxids = [tid for tid, cls in zip(taxids, classes) if cls in donor_classes]

        # 判断供体taxid是否符合条件
        if not donor_taxids:
            filtered_non_donor += 1
            continue

        # 供体类别数量（不同taxid数目）判断
        if len(set(donor_taxids)) < min_donor_class_count:
            filtered_few_donor += 1
            continue

        hgt_candidates.append(query)

    pd.DataFrame({"qseqid": hgt_candidates}).to_csv(output_file, index=False)

    print(f"✓ 筛选完成，共找到 {len(hgt_candidates)} 个HGT候选基因，输出到 {output_file}")
    print(f"总query数: {total}")
    print(f"被自身taxid过滤: {filtered_self}")
    print(f"被top5非供体过滤: {filtered_non_donor}")
    print(f"被供体种数不足过滤: {filtered_few_donor}")

if __name__ == "__main__":
    screen_hgt()

#index值计算
import pandas as pd
from collections import Counter

input_file = "hgt_candidates_detailed.csv"
output_file = "hgt_candidates_with_index.csv"

donor_classes = {"Bacteria", "Fungi", "Archaea", "Viruses", "Metazoa"}

def calculate_index(donor_counts):
    if not donor_counts:
        return 0.0
    total = sum(donor_counts.values())
    max_count = max(donor_counts.values())
    n = len(donor_counts)
    index = sum(count / max_count for count in donor_counts.values()) / n
    return round(index, 4)

def process():
    df = pd.read_csv(input_file)

    index_list = []
    donor_class_counts = []

    for _, row in df.iterrows():
        # 拆分top5的类群信息
        classes = str(row["top5_classes"]).split(";")
        class_counts = Counter(c for c in classes if c in donor_classes)

        index = calculate_index(class_counts)
        index_list.append(index)

        # 生成 donor_class_counts 字符串形式
        donor_class_counts.append(";".join(f"{k}:{v}" for k, v in class_counts.items()))

    df["donor_class_counts"] = donor_class_counts
    df["index"] = index_list

    df.to_csv(output_file, index=False)
    print(f"✅ index 计算完成，已保存到：{output_file}")

if __name__ == "__main__":
    process()

使用HGTfinder，AVP，preHGT计算水平基因转移，结果差异较大，有时候怀疑这个预测到底准不准。后续还需要建树验证。

水平基因转移基因鉴定1

Genome screening for candidate horizontal gene transfer (HGT) acquired genes

推荐阅读更多精彩内容