Re-evaluating the pairing scheme (the full parameter-calculation workflow, based on the genomic vcf.gz file of 78 individuals, with a discussion of how to design the scheme)

Goal: this post re-runs the evaluation of the pairing scheme and records the data-processing steps in detail. The parameters calculated are Het (heterozygosity), the kinship coefficient R, Froh, the probability of ROH elimination after pairing, genetic load, and the probability that a pairing converts realized load into potential load. Using these parameters, we then explore a complete ranking system for pairing schemes. (The analysis is based on the file SCT_filter78.call.snp.filtered.2allelic.DP.vcf.gz.vcf.gz.)
1. Create the working directories
mkdir -p {01.plink,02.Het,03.Kinship,04.ROH,05.Pair_ROH,06.Load,07.Pair_Load}
2. Convert the VCF file to PLINK format
plink --vcf /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/03.chr_combine/SCT_filter78.call.snp.filtered.2allelic.DP.vcf.gz.vcf.gz --make-bed --out SCT_raw

plink --bfile SCT_raw --recode --out SCT_raw
3. Check per-sample and per-SNP missingness rates
### The usual workflow is to filter out individuals with high missingness first, then filter out sites with high missingness (a plink --missing report can be generated beforehand to inspect both distributions)
plink --bfile SCT_raw --mind 0.02 --make-bed --out SCT_ind
plink --bfile SCT_ind --recode --out SCT_ind
plink --bfile SCT_ind --geno 0.02 --make-bed --out SCT_SNP
plink --bfile SCT_SNP --recode --out SCT_SNP
# The following individuals were filtered out:
#HNH_606 AVE_Depth 18.7
#HNH_650 AVE_Depth 16.39
#HNH_735 AVE_Depth 13.53
#HNH_809 AVE_Depth 15.43
# Number of sites filtered out: 428799
4. MAF filtering
### MAF frequency histogram (a plotting sketch is given at the end of this step)
plink --bfile SCT_SNP --freq --out MAF_check
# Filter
plink --bfile SCT_SNP --maf 0.05 --make-bed --out SCT_MAF
plink --bfile SCT_MAF --recode --out SCT_MAF
# After filtering, 3835315 sites remain
# At this point it is worth asking whether the large number of sites we removed might also matter for the downstream analyses
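A minimal sketch for plotting the MAF histogram from MAF_check.frq, assuming pandas and matplotlib are installed (the output name MAF_hist.png is arbitrary):

import pandas as pd
import matplotlib
matplotlib.use('Agg')  # render without a display, e.g. on a compute server
import matplotlib.pyplot as plt

# MAF_check.frq from plink --freq is whitespace-delimited with a MAF column
freq = pd.read_csv('MAF_check.frq', delim_whitespace=True)

plt.hist(freq['MAF'].dropna(), bins=50)
plt.xlabel('Minor allele frequency')
plt.ylabel('Number of SNPs')
plt.savefig('MAF_hist.png', dpi=300)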
5. Linkage-disequilibrium (LD) pruning
awk '{print $1 ,"\t", $1"_"$4, "\t", $3 , "\t", $4 , "\t", $5 ,"\t", $6}' SCT_MAF.bim > SCT_MAF1.bim
mv SCT_MAF1.bim SCT_MAF.bim
awk '{print $1"\t"$1"_"$4"\t"$3"\t"$4}' SCT_MAF.map > SCT_MAF1.map
mv SCT_MAF1.map SCT_MAF.map
plink --file SCT_MAF --indep-pairwise 50kb 1 0.9 --out SCT_LD0.9
plink --bfile SCT_MAF --extract SCT_LD0.9.prune.in --make-bed --out SCT_LD0.9
plink --bfile SCT_LD0.9 --recode --out SCT_LD0.9
# LD pruning removed too many sites; only 369215 remain
6. HWE filtering
# Because LD pruning discarded so many sites, the downstream steps use SCT_MAF instead of SCT_LD0.9 for the remaining filters
plink --bfile SCT_MAF --hwe 0.001 --make-bed --out SCT_hwe
plink --bfile SCT_hwe --recode --out SCT_hwe
7. Compute Het and the average Het
plink --bfile ../01.plink/SCT_hwe --export vcf --out SCT --allow-extra-chr
vcftools --vcf SCT.vcf --het --out SCT
sed '1d' SCT.het | awk '{print $1,"\t",$2,"\t",$4,"\t",($4-$2)/2258664721}' > SCT_het.txt
## Het is computed as (N_SITES - O(HOM)) / 2258664721, where the denominator is the genome length in bp (the same constant, in kb, is used for Froh below)
## sample.id contains the IDs of all the individuals you want to extract
grep -Ff sample.id SCT_het.txt | awk '{print $1, $4}' > het.txt
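For reference, a sketch of the same Het calculation in Python (assuming the genome length above; the output name het_python.txt is arbitrary):

import pandas as pd

# vcftools --het writes a whitespace-delimited table: INDV, O(HOM), E(HOM), N_SITES, F
het = pd.read_csv('SCT.het', delim_whitespace=True)

GENOME_LENGTH_BP = 2258664721  # genome length (bp) used throughout this post
het['Het'] = (het['N_SITES'] - het['O(HOM)']) / GENOME_LENGTH_BP
het[['INDV', 'Het']].to_csv('het_python.txt', sep='\t', index=False, header=False)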
### Script for computing the average heterozygosity between the candidate individual and every other individual
import pandas as pd

# Read SCT_het.txt; column 1 is the sample ID and column 4 is the heterozygosity
file_path = 'SCT_het.txt'
data = pd.read_csv(file_path, delim_whitespace=True, header=None)

# Keep only the sample ID (column 1) and the heterozygosity (column 4)
data = data[[0, 3]]
data.columns = ['SampleID', 'Heterozygosity']

# Get the heterozygosity of sample HNH_495
hnh_495_heterozygosity = data[data['SampleID'] == 'HNH_495']['Heterozygosity'].values[0]

# Average the heterozygosity of HNH_495 with each other sample
data['AverageHeterozygosity'] = (data['Heterozygosity'] + hnh_495_heterozygosity) / 2

# Build a column labelling each pair (e.g. HNH_495&<other sample ID>)
data['Pair'] = 'HNH_495&' + data['SampleID']

# Exclude the pairing of HNH_495 with itself
result = data[data['SampleID'] != 'HNH_495'][['Pair', 'AverageHeterozygosity']]

# Save the result to ave_het_result.txt
output_path = 'ave_het_result.txt'
result.to_csv(output_path, sep='\t', index=False, header=False)

A separate script (using the PyVCF package) computes, for a target individual, the genotype dissimilarity with every other individual in a VCF file:

import vcf
import pandas as pd

# Input file and target individual
input_file = "chengduSCT_73.vcf"
target_individual = "HNH_334"
output_file = "variation_degree.csv"

# Open the VCF file and load all records into memory
vcf_reader = vcf.Reader(open(input_file, 'r'))
individuals = vcf_reader.samples
if target_individual not in individuals:
    raise ValueError(f"目标个体 {target_individual} 不在 VCF 文件中。")

# Pre-load the genotype data for every site
genotype_data = []
for record in vcf_reader:
    genotypes = {sample: record.genotype(sample).data.GT for sample in individuals}
    genotype_data.append(genotypes)

# Convert the genotype data to a DataFrame for vectorized processing
df_genotypes = pd.DataFrame(genotype_data)

# Initialize the results container
results = []

# Compare each other individual with the target individual
for individual in individuals:
    if individual == target_individual:
        continue

    # Select the genotype columns of the target and the current individual
    target_genotypes = df_genotypes[target_individual]
    other_genotypes = df_genotypes[individual]

    # Similarity: count sites where both genotypes are "0/0" or both are "1/1"
    matching_sites = ((target_genotypes == "0/0") & (other_genotypes == "0/0")) | \
                     ((target_genotypes == "1/1") & (other_genotypes == "1/1"))
    similarity = matching_sites.sum() / len(df_genotypes)

    # Compute the dissimilarity and store the result
    variation_degree = 1 - similarity
    results.append({"Individual": individual, "Variation_Degree": variation_degree})

# Save the results to a CSV file
df_results = pd.DataFrame(results)
df_results.to_csv(output_file, index=False)
print(f"差异度计算完成,结果保存在 {output_file}")
8. Compute the kinship coefficient
plink --bfile ../01.plink/SCT_hwe --genome --min 0 --out kinship

### Extract from kinship.genome the relatedness (PI_HAT, column 10) between the candidate individual and every other individual (HNH_495 as an example)
awk '$2 == 495 || $4 == 495' kinship.genome | awk '{print $10}' > ID_495_Kinship.txt
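The awk one-liner keeps only the PI_HAT column. As a sketch (assuming pandas), the same extraction can keep the partner IDs next to PI_HAT, which makes it easier to line the values up with the other metrics (the output name is arbitrary):

import pandas as pd

# plink --genome output columns: FID1 IID1 FID2 IID2 RT EZ Z0 Z1 Z2 PI_HAT ...
genome = pd.read_csv('kinship.genome', delim_whitespace=True)

target = '495'  # IID of the candidate individual
mask = (genome['IID1'].astype(str) == target) | (genome['IID2'].astype(str) == target)
genome.loc[mask, ['FID1', 'IID1', 'FID2', 'IID2', 'PI_HAT']].to_csv(
    'ID_495_Kinship_with_ids.txt', sep='\t', index=False)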
9. Compute Froh
## Compute ROH
plink --bfile ../01.plink/SCT_hwe --homozyg --homozyg-kb 500 --homozyg-window-snp 20 --out ROH
## Compute Froh (Froh = total ROH length / genome length; here the genome length is 2258664.721 kb)
python Froh.py
### Extract the data for the individuals of interest
grep -Ff sample.id result.csv | awk '{print $1, $2}' > Froh.txt
# To count, for each individual, the ROH segments longer than a given length (here 3000 kb)
cat sample.id | while read id; do awk -v id="$id" '$2 == id' ROH.hom | awk '$9 > 3000 {print $9}' | wc -l; done

The Froh.py script

import pandas as pd

def process_froh(file_path, output_path):
    # Read the data, skipping the header row
    df = pd.read_csv(file_path, delim_whitespace=True, skiprows=1, header=None)

    # Combine the sample ID: join columns 1 and 2 with an underscore
    sample_ids = df[0].astype(str) + "_" + df[1].astype(str)

    # Extract the total ROH length (column 5, KB)
    roh_lengths = pd.to_numeric(df[4], errors='coerce')

    # Divide by the genome length (in kb) to obtain Froh
    froh_values = roh_lengths / 2258664.721

    # Build a new DataFrame: column 1 is the combined sample ID, column 2 the computed Froh
    result_df = pd.DataFrame({
        'Sample_ID': sample_ids,
        'Froh': froh_values
    })

    # Save the result as a CSV file
    result_df.to_csv(output_path, index=False)

# Example call
process_froh('ROH.hom.indiv', 'result.csv')
10. Proportion of ROH eliminated after pairing

(1) First, split the combined ROH file by individual

import pandas as pd

def split_file_by_individual(input_file):
    # Load the entire file into a pandas DataFrame to easily manipulate the data
    roh_data = pd.read_csv(input_file, delim_whitespace=True)

    # Loop through each individual and save their data to a separate file
    for iid, group in roh_data.groupby('IID'):
        # Define the output file name based on the individual ID
        file_name = f'{iid}.txt'

        # Add the header (column names) to the individual's DataFrame
        group_with_header = pd.concat([pd.DataFrame([roh_data.columns], columns=roh_data.columns), group])

        # Write the individual's data to a new file, including the header
        group_with_header.to_csv(file_name, sep='\t', index=False, header=False)
        print(f'File saved: {file_name}')

# Replace 'path/to/ROH.hom' with the actual file path
input_file_path = '../ROH.hom'
split_file_by_individual(input_file_path)

(1_1) Split out a single individual

import pandas as pd

def split_individual(input_file, target_fid, target_iid):
    # Load the entire file into a pandas DataFrame to easily manipulate the data
    roh_data = pd.read_csv(input_file, delim_whitespace=True)

    # Filter the data to keep only the rows corresponding to the target FID and IID
    individual_data = roh_data[(roh_data['FID'] == target_fid) & (roh_data['IID'] == target_iid)]

    if not individual_data.empty:
        # Define the output file name based on the individual FID and IID
        file_name = f'{target_iid}.txt'

        # Add the header (column names) to the individual's DataFrame
        individual_with_header = pd.concat([pd.DataFrame([roh_data.columns], columns=roh_data.columns), individual_data])

        # Write the individual's data to a new file, including the header
        individual_with_header.to_csv(file_name, sep='\t', index=False, header=False)
        print(f'File saved: {file_name}')
    else:
        print(f'No data found for FID {target_fid} and IID {target_iid}')

# Replace 'path/to/ROH.hom' with the actual file path
input_file_path = '../../04.ROH/ROH.hom'
target_fid = "HNH"  # Specify the FID you want to filter
target_iid = "616"    # Specify the IID you want to filter
split_individual(input_file_path, target_fid, target_iid)

(2) With 495 as the individual seeking a mate, find the ROH regions where 495 intersects each other individual

import os
import pandas as pd

# Function for reading the data
def read_data(file_path):
    return pd.read_csv(file_path, sep='\t', usecols=[3, 6, 7], header=0, names=['CHR', 'POS1', 'POS2'])

# Function for finding intersecting ROH
def find_intersections(df1, df2):
    intersections = []
    for chr_num in pd.unique(df1['CHR']):
        df1_chr = df1[df1['CHR'] == chr_num]
        df2_chr = df2[df2['CHR'] == chr_num]
        for _, row1 in df1_chr.iterrows():
            for _, row2 in df2_chr.iterrows():
                start_max = max(row1['POS1'], row2['POS1'])
                end_min = min(row1['POS2'], row2['POS2'])
                if start_max < end_min:
                    kb = end_min - start_max
                    intersections.append((chr_num, start_max, end_min, kb))
    return intersections

# Load the reference data (495.txt)
data_495 = read_data('../01.individual_ROH_file/495.txt')

# Read sample.id to get the paths of the other files
with open('sample.id', 'r') as f:
    sample_files = f.read().splitlines()

# Loop over each sample file, run the comparison, and save the results
for sample_file in sample_files:
    data_sample = read_data(sample_file)  # load the current sample file
    intersections = find_intersections(data_495, data_sample)  # find the intersecting regions

    # Strip the path to get the base name of the sample file
    sample_basename = os.path.basename(sample_file).replace('.txt', '')

    # Build the output file name, in the form 495_xxx.txt
    output_file = f"495_{sample_basename}.txt"

    # If intersections were found, save them; otherwise just report that none were found
    if intersections:
        # Convert the results to a DataFrame
        intersections_df = pd.DataFrame(intersections, columns=['CHR', 'POS1', 'POS2', 'KB'])
        # Save the results to the corresponding file
        intersections_df.to_csv(output_file, sep='\t', index=False)
        print(f"Saved intersections to {output_file}")
    else:
        # If no intersections were found, either write an empty file or skip the sample
        print(f"No intersections found for {sample_file}")

(3) Extract the chromosome numbers and position coordinates of the intersections for all individuals

ls *txt | tr "\t" "\n" | awk -F "." '{print $1}' | while read id ; do awk '{print $1 "\t" $2"-"$3}' $id.txt > $id.position.txt ; done

(4) Batch (Snakemake) script for generating the paired-individual VCF files, vcf.gz files, and csi index files

# ID of the reference individual, HNH_495
INDIVIDUAL1 = "HNH_495"

# Read the individual IDs from the sample.id file, keeping the original individual names
SAMPLES = [line.strip() for line in open("sample.id1")]

# Define all output files; the Pta and HNH prefixes are stripped only for file naming, the individual names themselves are kept
output_vcfs = [f"{INDIVIDUAL1.replace('Pta_', '').replace('HNH_', '')}_{sample.replace('Pta_', '').replace('HNH_', '')}.vcf.gz" for sample in SAMPLES]
output_indexes = [f"{vcf}.csi" for vcf in output_vcfs]

rule all:
    input:
        output_vcfs,  # all compressed VCF files, the final targets
        output_indexes  # all indexes of the compressed VCF files

# Rule 1: extract the data of the two individuals from the VCF file
rule extract_vcf:
    input:
        vcf="/home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/test/12.NEW_pairing_program/01.plink/SCT.vcf"
    output:
        vcf="{name1}_{name2}.vcf"
    params:
        individual1=INDIVIDUAL1,
        individual2=lambda wildcards: f"HNH_{wildcards.name2}"  # 使用wildcards.name2 作为个体名的一部分
    shell:
        """
        echo "vcftools --vcf {input.vcf} --recode --recode-INFO-all --stdout --indv {params.individual1} --indv {params.individual2} > {output.vcf}"
        vcftools --vcf {input.vcf} --recode --recode-INFO-all --stdout --indv {params.individual1} --indv {params.individual2} > {output.vcf}
        """

# Rule 2: compress the VCF file
rule compress_vcf:
    input:
        vcf="{name1}_{name2}.vcf"
    output:
        vcf_gz="{name1}_{name2}.vcf.gz"
    shell:
        """
        bcftools view {input.vcf} -Oz -o {output.vcf_gz}
        """

# Rule 3: index the compressed VCF file
rule index_vcf:
    input:
        vcf_gz="{name1}_{name2}.vcf.gz"
    output:
        vcf_gz_index="{name1}_{name2}.vcf.gz.csi"
    shell:
        """
        bcftools index {input.vcf_gz} -c
        """
       

The script above handles samples labelled HNH but not Pta samples; the script below handles the Pta samples

# ID of the reference individual, HNH_495
INDIVIDUAL1 = "HNH_495"

# Read the individual IDs from the sample.id file, keeping the original individual names
SAMPLES = [line.strip() for line in open("sample.id1_2")]

# Define all output files; the Pta and HNH prefixes are stripped only for file naming, the individual names themselves are kept
output_vcfs = [f"{INDIVIDUAL1.replace('Pta_', '').replace('HNH_', '')}_{sample.replace('Pta_', '').replace('HNH_', '')}.vcf.gz" for sample in SAMPLES]
output_indexes = [f"{vcf}.csi" for vcf in output_vcfs]

rule all:
    input:
        output_vcfs,  # all compressed VCF files, the final targets
        output_indexes  # all indexes of the compressed VCF files

# Rule 1: extract the data of the two individuals from the VCF file
rule extract_vcf:
    input:
        vcf="/home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/test/12.NEW_pairing_program/01.plink/SCT.vcf"
    output:
        vcf="{name1}_{name2}.vcf"
    params:
        individual1=INDIVIDUAL1,
        individual2=lambda wildcards: wildcards.name2 if wildcards.name2.startswith(('HNH_', 'Pta_')) else f"Pta_{wildcards.name2}"  # only add the Pta_ prefix for individuals that lack a prefix
    shell:
        """
        echo "vcftools --vcf {input.vcf} --recode --recode-INFO-all --stdout --indv {params.individual1} --indv {params.individual2} > {output.vcf}"
        vcftools --vcf {input.vcf} --recode --recode-INFO-all --stdout --indv {params.individual1} --indv {params.individual2} > {output.vcf}
        """

# Rule 2: compress the VCF file
rule compress_vcf:
    input:
        vcf="{name1}_{name2}.vcf"
    output:
        vcf_gz="{name1}_{name2}.vcf.gz"
    shell:
        """
        bcftools view {input.vcf} -Oz -o {output.vcf_gz}
        """

# Rule 3: index the compressed VCF file
rule index_vcf:
    input:
        vcf_gz="{name1}_{name2}.vcf.gz"
    output:
        vcf_gz_index="{name1}_{name2}.vcf.gz.csi"
    shell:
        """
        bcftools index {input.vcf_gz} -c
        """

(5) Extract the VCF file corresponding to each ROH segment, using the chromosome number and position coordinates (this step takes a very long time)

import pandas as pd

# Read the sample IDs
samples = pd.read_csv("sample.id2", header=None, names=["sample"])

# Build the list of output files
output_files = []

for sample in samples['sample']:
    # Load the position information for this sample
    positions = pd.read_csv(f"{sample}.position.txt", sep="\t", header=0)

    # Build the output file names for this sample
    for index, row in positions.iterrows():
        output_files.append(f"{sample}.CHR{row['CHR']}.{row['POS1-POS2']}.vcf")

rule all:
    input:
        output_files

rule filter_vcf:
    input:
        vcf="{sample}.vcf.gz"
    output:
        vcf="{sample}.CHR{chr}.{range}.vcf"
    params:
        region=lambda wildcards: f"{wildcards.chr}:{wildcards.range}"  # 指定拆分的染色体区间
    shell:
        """
        bcftools filter {input.vcf} --regions {params.region} > {output.vcf}
        """

(6) Generate two intermediate files and create a new folder, combine_ROH

cat sample.id2 | while read id ; do ls $id.CHR* | tr "\t" "\n" > $id.CHR.txt ; done
cat sample.id2 | while read id ; do awk '{print "../" $1}' $id.CHR.txt > $id.chr.txt ; done
mkdir combine_ROH
mv ../*chr.txt ./

(7) Generate the new VCF file of the predicted offspring

def recombine_genotypes(gt1, gt2):
    # Combine the two parental genotypes into a predicted offspring genotype
    if gt1 == "0/1" or gt2 == "0/1":
        return "0/1"
    elif (gt1 == "0/0" and gt2 == "1/1") or (gt1 == "1/1" and gt2 == "0/0"):
        return "0/1"
    elif gt1 == "1/1" and gt2 == "1/1":
        return "1/1"
    else:
        return "0/0"

def process_vcf(input_filename, output_filename):
    with open(input_filename, 'r') as file_in, open(output_filename, 'w') as file_out:
        for line in file_in:
            if line.startswith('#'):
                if line.startswith('#CHROM'):
                    # Rewrite the column header line
                    headers = line.strip().split('\t')
                    headers = headers[:9] + ['forecast']
                    file_out.write('\t'.join(headers) + '\n')
                else:
                    # Copy header/comment lines unchanged
                    file_out.write(line)
            else:
                # Process the genotype data
                parts = line.strip().split('\t')
                
                # Check that there are enough columns (at least 11: 9 standard fields plus 2 genotype fields)
                if len(parts) < 11:
                    print(f"Warning: Skipping line in {input_filename} due to insufficient columns: {line.strip()}")
                    continue
                
                # Extract the two genotype fields
                gt1 = parts[9].strip()
                gt2 = parts[10].strip()
                
                # Guard against invalid GT fields
                if gt1 not in ["0/0", "0/1", "1/1"] or gt2 not in ["0/0", "0/1", "1/1"]:
                    print(f"Warning: Skipping line with invalid genotype format in {input_filename}: {line.strip()}")
                    continue
                
                forecast_gt = recombine_genotypes(gt1, gt2)
                new_line = parts[:9] + [forecast_gt]
                file_out.write('\t'.join(new_line) + '\n')

def batch_process(file_list):
    with open(file_list, 'r') as files:
        for file_path in files:
            file_path = file_path.strip()
            output_filename = file_path.split('/')[-1]  # 提取文件名
            process_vcf(file_path, output_filename)

def process_all_vcf_from_sample_list(sample_list_filename):
    # Read each chr.txt file listed in the sample.id file
    with open(sample_list_filename, 'r') as sample_list:
        for sample_file in sample_list:
            sample_file = sample_file.strip()  # 去除换行符
            print(f"Processing {sample_file}...")
            batch_process(sample_file)

# File listing the input file lists (i.e. sample.id)
sample_list = 'sample.id'

# Process all the chr.txt files in batch
process_all_vcf_from_sample_list(sample_list)
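For clarity, the simplified offspring-genotype rule implemented by recombine_genotypes() above can be restated as a lookup over the homozygous crosses (purely illustrative; any cross involving a 0/1 parent is treated as producing a 0/1 offspring):

# Restatement of the rule used above for the homozygous parental crosses
offspring_rule = {
    ("0/0", "0/0"): "0/0",
    ("0/0", "1/1"): "0/1",
    ("1/1", "0/0"): "0/1",
    ("1/1", "1/1"): "1/1",
}
for (g1, g2), off in offspring_rule.items():
    print(f"{g1} x {g2} -> {off}")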

(8) Run plink to convert to PLINK format; create a ped folder first

#!/bin/bash

# Read each individual ID from the sample.id1 file
while read sample; do
    echo "Processing VCF files for sample: $sample"

    # Find all VCF files for the current individual
    for vcf_file in ${sample}.CHR*.vcf; do
        if [[ -f "$vcf_file" ]]; then
            # Extract the CHR and range fields, using '_' instead of '.' in the output name
            chr=$(echo $vcf_file | awk -F '.' '{print $2}')
            range=$(echo $vcf_file | awk -F '.' '{print $3}')
            output_prefix="ped/${sample}_CHR${chr}_${range}"

            # Run plink and write the output with the individual name as prefix
            plink --vcf "$vcf_file" --make-bed --out "$output_prefix"

            echo "Processed $vcf_file -> Output: $output_prefix"
        else
            echo "No VCF file found for sample: $sample with pattern ${sample}.CHR*.vcf"
        fi
    done
done < sample.id1

(9) Convert to map/ped format; first cd into the ped folder

ls *.bed | tr "\t" "\n" | awk -F "." '{print $1}' | while read id ; do plink --bfile $id --recode --out $id ; done

(10) Create a ROH folder and compute the ROH of the predicted offspring

ls *.bed | tr "\t" "\n" | awk -F "." '{print $1}' | while read id ; do plink --bfile $id --homozyg --homozyg-kb 500 --homozyg-window-snp 20 --out ROH/$id; done
(11) Get the number of ROH that are retained. The entries in sample.id1 are individual IDs; it is best to sort them to match the data you are processing, which makes the bookkeeping easier
### Before this, cd into the ROH folder and delete some of the files in it
rm *indiv *summary *log *nosex
### Count them
cat sample.id1 | while read id ; do find . -name "$id*" -type f -size +200c | wc -l ; done
### Or write the counts to a file to keep them
cat sample.id1 | while read id ; do find . -name "$id*" -type f -size +200c | wc -l >> number.txt; done
### Count the total number of ROH files per pairing, to compute the retention rate (retained / total; the division step appears in the consolidated workflow below)
cat sample.id1 | while read id ; do ls $id* | tr "\t" "\n" | wc -l >> all_roh.txt ; done

I have written a complete script for the ROH-fixation analysis

Before running the script, do the following preparation:
1. ls ../01.individual_ROH_file/*.txt | tr "\t" "\n" > sample.id
vi sample.id (delete the individual to be paired)
2. cp ../../01.plink/sample.id sample.id12
vi sample.id12 (delete the individual to be paired)
3. vi Snakefile_step4 (change to the correct pairing ID)
4. vi workflow.sh (change to the correct pairing ID)

#!/bin/bash

# Step 1: find the ROH regions where the pairing individual intersects each other individual
echo "Step 1: find the ROH intersections between the pairing individual and the other individuals"
python3 - <<EOF
import os
import pandas as pd

def read_data(file_path):
    return pd.read_csv(file_path, sep='\t', usecols=[3, 6, 7], header=0, names=['CHR', 'POS1', 'POS2'])

def find_intersections(df1, df2):
    intersections = []
    for chr_num in pd.unique(df1['CHR']):
        df1_chr = df1[df1['CHR'] == chr_num]
        df2_chr = df2[df2['CHR'] == chr_num]
        for _, row1 in df1_chr.iterrows():
            for _, row2 in df2_chr.iterrows():
                start_max = max(row1['POS1'], row2['POS1'])
                end_min = min(row1['POS2'], row2['POS2'])
                if start_max < end_min:
                    kb = end_min - start_max
                    intersections.append((chr_num, start_max, end_min, kb))
    return intersections

data_521 = read_data('../01.individual_ROH_file/521.txt')

with open('sample.id', 'r') as f:
    sample_files = f.read().splitlines()

for sample_file in sample_files:
    data_sample = read_data(sample_file)
    intersections = find_intersections(data_521, data_sample)
    sample_basename = os.path.basename(sample_file).replace('.txt', '')
    output_file = f"521_{sample_basename}.txt"
    if intersections:
        intersections_df = pd.DataFrame(intersections, columns=['CHR', 'POS1', 'POS2', 'KB'])
        intersections_df.to_csv(output_file, sep='\t', index=False)
        print(f"Saved intersections to {output_file}")
    else:
        print(f"No intersections found for {sample_file}")
EOF

# Step 2: generate the ID file of intersecting individuals
echo "Step 2: generate the ID file of intersecting individuals"
ls *txt | tr "\t" "\n" | awk -F "." '{print $1}' > sample.id2

# Step 3: extract the chromosome numbers and position coordinates of the intersections for all individuals
echo "Step 3: extract the chromosome numbers and position coordinates of the intersections"
ls  *_*.txt | tr "\t" "\n" | awk -F "." '{print $1}' | while read id ; do awk '{print $1 "\t" $2"-"$3}' $id.txt > $id.position.txt ; done

# Step 4: run Snakemake to generate the paired-individual VCF files, using 30 threads
echo "Step 4: run Snakemake to generate the paired-individual VCF files"
snakemake -s Snakefile_step4 -j 30

# Step 5: run Snakemake to extract the VCF files of the ROH segments by chromosome and position, using 30 threads
echo "Step 5: run Snakemake to extract the VCF files of the ROH segments"
snakemake -s Snakefile_step5 -j 30

# Steps 6-17: merge ROH, run plink, compute ROH, and so on
echo "Steps 6-17: merge ROH, run plink, compute ROH"
# Run the remaining shell commands in order
cat sample.id2 | while read id ; do ls $id.CHR* | tr "\t" "\n" > $id.CHR.txt ; done
cat sample.id2 | while read id ; do awk '{print "../" $1}' $id.CHR.txt > $id.chr.txt ; done
cd combine_ROH
mv ../*chr.txt ./
ls *txt | tr "\t" "\n" >sample.id


python3 - <<EOF
def recombine_genotypes(gt1, gt2):
    if gt1 == "0/1" or gt2 == "0/1":
        return "0/1"
    elif (gt1 == "0/0" and gt2 == "1/1") or (gt1 == "1/1" and gt2 == "0/0"):
        return "0/1"
    elif gt1 == "1/1" and gt2 == "1/1":
        return "1/1"
    else:
        return "0/0"

def process_vcf(input_filename, output_filename):
    with open(input_filename, 'r') as file_in, open(output_filename, 'w') as file_out:
        for line in file_in:
            if line.startswith('#'):
                if line.startswith('#CHROM'):
                    headers = line.strip().split('\t')
                    headers = headers[:9] + ['forecast']
                    file_out.write('\t'.join(headers) + '\n')
                else:
                    file_out.write(line)
            else:
                parts = line.strip().split('\t')
                if len(parts) < 11:
                    continue
                gt1 = parts[9].strip()
                gt2 = parts[10].strip()
                if gt1 not in ["0/0", "0/1", "1/1"] or gt2 not in ["0/0", "0/1", "1/1"]:
                    continue
                forecast_gt = recombine_genotypes(gt1, gt2)
                new_line = parts[:9] + [forecast_gt]
                file_out.write('\t'.join(new_line) + '\n')

def batch_process(file_list):
    with open(file_list, 'r') as files:
        for file_path in files:
            file_path = file_path.strip()
            output_filename = file_path.split('/')[-1]
            process_vcf(file_path, output_filename)

def process_all_vcf_from_sample_list(sample_list_filename):
    with open(sample_list_filename, 'r') as sample_list:
        for sample_file in sample_list:
            sample_file = sample_file.strip()
            batch_process(sample_file)

sample_list = 'sample.id'
process_all_vcf_from_sample_list(sample_list)
EOF

echo "Step: 处理VCF文件并生成PLINK格式"
cp ../sample.id2 sample.id1

# Read each individual ID from the sample.id1 file
while read sample; do
    echo "Processing VCF files for sample: $sample"

    # Find all VCF files for the current individual
    for vcf_file in ${sample}.CHR*.vcf; do
        if [[ -f "$vcf_file" ]]; then
            # Extract the CHR and range fields, using '_' instead of '.' in the output name
            chr=$(echo $vcf_file | awk -F '.' '{print $2}')
            range=$(echo $vcf_file | awk -F '.' '{print $3}')
            output_prefix="ped/${sample}_CHR${chr}_${range}"

            # Run plink and write the output with the individual name as prefix
            plink --vcf "$vcf_file" --make-bed --out "$output_prefix"

            echo "Processed $vcf_file -> Output: $output_prefix"
        else
            echo "No VCF file found for sample: $sample with pattern ${sample}.CHR*.vcf"
        fi
    done
done < sample.id1
cd ped
ls *.bed | tr "\t" "\n" | awk -F "." '{print $1}' | while read id ; do plink --bfile $id --recode --out $id ; done
ls *.bed | tr "\t" "\n" | awk -F "." '{print $1}' | while read id ; do plink --bfile $id --homozyg --homozyg-kb 500 --homozyg-window-snp 20 --out ROH/$id; done
cd ROH
rm *indiv
rm *summary
rm *log
rm *nosex
cp ../../sample.id1 ./
cp ../../../sample.id12 sample.id

python3 - <<EOF
import pandas as pd

def reorder_sample_id(sample_id_file, sample_id1_file, output_file):
    sample_id_df = pd.read_csv(sample_id_file, header=None, names=['HNH_ID'])
    sample_id1_df = pd.read_csv(sample_id1_file, header=None, names=['ID_521'])
    sample_id_df['XXX'] = sample_id_df['HNH_ID'].str.split('_').str[1]
    sample_id1_df['XXX'] = sample_id1_df['ID_521'].str.split('_').str[1]
    merged_df = pd.merge(sample_id_df, sample_id1_df, on='XXX', how='left')
    merged_df[['ID_521']].to_csv(output_file, header=False, index=False)

reorder_sample_id('sample.id', 'sample.id1', 'sample.id2')
EOF

cat sample.id2 | while read id ; do find . -name "$id*" -type f -size +200c | wc -l >> number.txt; done
cat sample.id2 | while read id ; do ls $id* | tr "\t" "\n" | wc -l >> all_roh.txt ; done
# Divide the numbers in the two files line by line and save the results
echo "Step: compute the ratios and save the results to result.txt"
python3 - <<EOF
with open('number.txt', 'r') as f1, open('all_roh.txt', 'r') as f2:
    numbers = [float(line.strip()) for line in f1]
    all_roh = [float(line.strip()) for line in f2]

if len(numbers) != len(all_roh):
    raise ValueError("两个文件的行数不一致,无法逐行对应计算。")

results = [n / r for n, r in zip(numbers, all_roh)]

with open('result.txt', 'w') as f3:
    for result in results:
        f3.write(f"{result}\n")
EOF

cd ../
rm *ped
rm *nosex
rm *map
rm *log
rm *fam
rm *bim
rm *bed
cd /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/test/12.NEW_pairing_program/05.Pair_ROH/02.intersection_ROH
rm *vcf
rm *vcf.gz
rm *csi
rm *txt
cd /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/test/12.NEW_pairing_program/05.Pair_ROH/02.intersection_ROH/combine_ROH
rm *txt
rm *vcf
echo "All steps completed!"

Snakefile_step4

INDIVIDUAL1 = "HNH_334"
SAMPLES = [line.strip() for line in open("sample.id12")]

output_vcfs = [f"{INDIVIDUAL1.replace('Pta_', '').replace('HNH_', '')}_{sample.replace('Pta_', '').replace('HNH_', '')}.vcf.gz" for sample in SAMPLES]
output_indexes = [f"{vcf}.csi" for vcf in output_vcfs]

rule all:
    input:
        output_vcfs,  # All compressed VCF files
        output_indexes  # All VCF index files

rule extract_vcf:
    input:
        vcf="/home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/test/12.NEW_pairing_program/01.plink/sct.vcf"
    output:
        vcf="{name1}_{name2}.vcf"
    params:
        individual1=INDIVIDUAL1,
        individual2=lambda wildcards: f"HNH_{wildcards.name2}"
    shell:
        """
        vcftools --vcf {input.vcf} --recode --recode-INFO-all --stdout --indv {params.individual1} --indv {params.individual2} > {output.vcf}
        """

rule compress_vcf:
    input:
        vcf="{name1}_{name2}.vcf"
    output:
        vcf_gz="{name1}_{name2}.vcf.gz"
    shell:
        """
        bcftools view {input.vcf} -Oz -o {output.vcf_gz}
        """

rule index_vcf:
    input:
        vcf_gz="{name1}_{name2}.vcf.gz"
    output:
        vcf_gz_index="{name1}_{name2}.vcf.gz.csi"
    shell:
        """
        bcftools index {input.vcf_gz} -c
        """

Snakefile_step5

#Snakefile_step5
import pandas as pd

# First read the sample IDs from sample.id2
samples = [line.strip() for line in open("sample.id2")]

# Read each sample's position information and build the output VCF file paths
def get_output_files():
    output_files = []
    for sample in samples:
        # Read the position file for this sample and split the "POS1-POS2" column into two columns
        positions = pd.read_csv(f"{sample}.position.txt", sep="\t")
        # Split the POS1-POS2 column into POS1 and POS2
        positions[['POS1', 'POS2']] = positions['POS1-POS2'].str.split('-', expand=True)
        # Convert POS1 and POS2 to integers
        positions['POS1'] = pd.to_numeric(positions['POS1'])
        positions['POS2'] = pd.to_numeric(positions['POS2'])

        # Build the output VCF file name for each position
        for index, row in positions.iterrows():
            output_files.append(f"{sample}.CHR{row['CHR']}.{row['POS1']}-{row['POS2']}.vcf")
    return output_files

# Global rule requiring all generated VCF files
rule all:
    input:
        get_output_files()

# Filtering rule for each VCF file
rule filter_vcf:
    input:
        vcf="{sample}.vcf.gz"
    output:
        vcf="{sample}.CHR{chr}.{pos_range}.vcf"
    params:
        region=lambda wildcards: f"{wildcards.chr}:{wildcards.pos_range}"  # 使用正确的格式 {chr}:{start}-{end}
    shell:
        """
        bcftools filter {input.vcf} --regions {params.region} > {output.vcf}
        """
11. Compute the load

(1) Preliminary data processing needed for the load calculation

# Steps 1-5 compute the AAMatrix score
### 1. Convert the gff file to gtf (using the Indochinese tiger as an example)
gffread genomic.gff -T -o India_tiger.gtf
### 2. Convert the gtf to the GenePred file format
gtfToGenePred -genePredExt India_tiger.gtf India_tiger_refGene.txt
### 3. Generate the transcript sequence file
perl /home/yjc/Software/annovar/retrieve_seq_from_fasta.pl --format refGene --seqfile GCF_018350195.1_P.tigris_Pti1_mat1.1_genomic.fna India_tiger_refGene.txt --outfile India_tiger_refGeneMrna.fa
### 4. Convert the VCF file to the mit format that ANNOVAR can read
perl /home/yjc/Software/annovar/convert2annovar.pl --format vcf4old India_tiger_maf005.vcf --outfile mit
### 5. Compute the AAMatrix score; a score above 150 is considered risky
perl /home/yjc/Software/annovar/annotate_variation.pl mit /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/test/09.all_species/08.India_tiger/ref/ --aamatrixfile /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/ref/NOVAR/grantham.matrix --outfile India_tiger -buildver India_tiger
# Steps 6-9 analyse the loss-of-function (LOF) sites
### 6. There is a snpEff.config file under /home/yjc/Software/miniconda3/share/snpeff-5.2-0; edit its last lines as follows
#genome, version
ref_2.genome : ref_2
data.dir = /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/AT_data/ref/
### 7. Create a ref_2 folder under ref and put four files in it: genes.gff, sequences.fa, cds.fa, protein.fa
## Download these files if you can; otherwise generate them yourself
gffread in.gff3 -g sequences.fa -x cds.fa
gffread genes.gff -g sequences.fa -y protein.fa
### 8. Build the database (the -Xmx8g option matters)
snpEff -Xmx8g build -gff3 -v ref_2
### 9. Run the analysis and generate the file of loss-of-function variants
snpEff ref_2 AT.recode.vcf > AT.ann


### Generate the mit file in the format used for the 150-score calculation
perl /home/yjc/Software/annovar/convert2annovar.pl --format vcf4old ../01.plink/SCT.vcf --outfile mit
### Compute the AAMatrix score; a score above 150 is considered risky
perl /home/yjc/Software/annovar/annotate_variation.pl mit /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/ref/NOVAR/ --aamatrixfile ./grantham.matrix --outfile ./SCT -buildver SCT

(2) Extract the sites with a score above 150

# Open the input and output files
with open('SCT.exonic_variant_function', 'r') as infile, open('filtered_SCT.exonic_variant_function', 'w') as outfile:
    for line in infile:
        # Check whether the line contains an AAMatrix score
        if 'AAMatrix' in line:
            # Extract the AAMatrix value
            try:
                score = int(line.split('AAMatrix=')[1].split(',')[0])
                # If the AAMatrix value is greater than 150, write the line to the output file
                if score > 150:
                    outfile.write(line)
            except (IndexError, ValueError):
                # Skip the line if the AAMatrix score cannot be parsed
                continue

(3) Get the chromosome and position coordinates from the generated filtered_SCT.exonic_variant_function file

awk '{print $5,$6}' filtered_SCT.exonic_variant_function > position.txt

(4) Extract the load sites from the full VCF file into a new VCF file

vcftools --vcf ../01.plink/SCT.vcf --positions position.txt --recode --out SCT_position.vcf

(5) Split the VCF file by individual

#!/bin/bash

# Input files
input_vcf="SCT_position.vcf.recode.vcf"
sample_list="sample.id1"

# Read the sample list
while IFS= read -r sample
do
    # Output file name
    output_vcf="${sample}.vcf"

    # Extract this sample from the VCF with bcftools
    bcftools view -s "$sample" -Oz -o "${output_vcf}.gz" $input_vcf

    # Decompress the resulting VCF.GZ file
    gunzip "${output_vcf}.gz"

    echo "Generated ${output_vcf}"
done < "$sample_list"

(6) Count, for each individual, the number of potential-load (0/1) and realized-load (1/1) genotypes, and the genetic-load ratio. For the bookkeeping it is best to sort sample.id in the order you want the statistics reported in

cat sample.id | while read id ; do grep "0/1" $id.vcf | wc -l >> het.txt ;done
cat sample.id | while read id ; do grep "1/1" $id.vcf | wc -l >> hom.txt ;done
### The deleterious genetic-load ratio is computed with the formula below
### To compute the ratio with a script, first merge hom.txt and het.txt into one file
paste hom.txt het.txt > load.txt

Compute the ratio with a script; the formula is (2 * hom) / (2 * hom + het), i.e. the proportion of deleterious alleles that are in the homozygous (realized) state (e.g. hom = 100 and het = 400 gives 200/600 ≈ 0.33)

# Open the input file load.txt and the output file radio_output.txt
with open('load.txt', 'r') as infile, open('radio_output.txt', 'w') as outfile:
    # Loop over each line
    for line in infile:
        # Skip empty lines
        if not line.strip():
            continue

        # Split the line; fields are separated by spaces or tabs
        hom, het = map(float, line.split())

        # Apply the formula (2 * hom) / (2 * hom + het)
        result = (2 * hom) / (2 * hom + het)

        # Write the result to the output file
        outfile.write(f"{result}\n")

print("计算完成,结果已写入 radio_output.txt 文件中。")

After computing the load, collect the statistics, extracting only the IDs that are needed and keeping the data in a consistent order (then paste in the results)

### Attach the corresponding IDs to the results
paste sample.id radio_output.txt > radio.txt
### Make a new sample file and delete the IDs that are not needed
cp sample.id sample1.id
vi sample1.id   (delete the IDs that are not needed)
### Print the required data directly to the screen
cat sample1.id | while read id ; do grep "$id" radio.txt | awk '{print $2}'; done

(7) Extract the LOF (loss-of-function) sites

# Keywords to match
keywords = ["stop_gained", "splice_acceptor_variant", "splice_donor_variant"]

# Input and output file names
input_file = 'SC_sh2.ann'
output_file = 'SC_sh2_filtered.ann'

# Process the file
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for line in infile:
        # Keep the line if it contains any of the keywords
        if any(keyword in line for keyword in keywords):
            outfile.write(line)

print(f'Processed {input_file}, results written to {output_file}')

(8) Generate the file of chromosome numbers and positions of the LOF sites

awk '{print $1,$2}' SC_sh2_filtered.ann > position1.txt

(9) Now that I have the coordinates of the deleterious sites and of the loss-of-function sites, check whether any coordinates are shared between the two

# Input and output files
file1 = 'position.txt'
file2 = 'position1.txt'
output_file = 'same_position.txt'

# Read the chromosome numbers and SNP coordinates from a position file
def read_positions(filename):
    positions = set()
    with open(filename, 'r') as f:
        for line in f:
            chrom, pos = line.strip().split()
            positions.add((chrom, pos))
    return positions

# Get the positions from both files
positions_file1 = read_positions(file1)
positions_file2 = read_positions(file2)

# Find the shared sites
same_positions = positions_file1.intersection(positions_file2)

# Write the shared sites to the output file
with open(output_file, 'w') as outfile:
    for chrom, pos in same_positions:
        outfile.write(f'{chrom}\t{pos}\n')

print(f'The shared sites have been written to {output_file}')

(10) If there are no shared sites, merge the two position files and then pull the genetic-load sites out of the whole-genome VCF

cat position.txt position1.txt > position2.txt
vcftools --vcf ../01.plink/SCT.vcf --positions position2.txt --recode --out SCT_position2.vcf

** Then return to step (5), "split the VCF file by individual", and continue the downstream analysis **

12. Compute the generation and elimination proportions of realized load after pairing (directly from the combined position VCF file)

Computing the elimination and generation proportions of realized load:
If individual A is 1/1 at a site and individual B is 1/1 at the same site, the realized-load generation proportion for that site is 1; if A is 1/1 and B is 0/1, it is 0.5; if A is 0/1 and B is 1/1, it is 0.5; and if A is 0/1 and B is 0/1, it is 0.25. Summing these values over all sites and taking the mean gives the realized-load generation proportion for the pair.
If A is 1/1 at a site and B is 0/1, the elimination probability is 0.5; if A is 0/1 and B is 1/1, it is 0.5; and if A is 0/1 and B is 0/1, it is 0.25. Summing over all such sites and taking the mean gives the realized-load elimination proportion for the pair. (The script below also assigns an elimination value of 1 when the partner is 0/0 at a 1/1 site, and 0.5 when the partner is 0/0 at a 0/1 site.)
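As a small worked example of the generation rules above, using hypothetical genotypes at four sites:

# generation proportions of the four parental combinations described above
generation = {("1/1", "1/1"): 1.0, ("1/1", "0/1"): 0.5,
              ("0/1", "1/1"): 0.5, ("0/1", "0/1"): 0.25}
sites = [("1/1", "1/1"), ("1/1", "0/1"), ("0/1", "1/1"), ("0/1", "0/1")]
values = [generation[s] for s in sites]
print(sum(values) / len(values))  # mean generation proportion for these four sites: 0.5625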

import pandas as pd
import numpy as np

# Read the VCF file and extract the genotype data
vcf_file = "../06.Load/SCT_position.vcf.recode.vcf"
with open(vcf_file, 'r') as f:
    lines = [line.strip() for line in f if not line.startswith('##')]

header = lines[0].split('\t')
data = [line.split('\t') for line in lines[1:]]
df = pd.DataFrame(data, columns=header)

# Check the column names
print("Columns in the DataFrame:")
print(df.columns)

# Strip any stray whitespace from the column names
df.columns = df.columns.str.strip()

# Extract the genotype data of the target individual (HNH_495) and of the other individuals
target_individual = 'HNH_495'
other_individuals = [col for col in df.columns[9:] if col != target_individual]

target_genotypes = df[['#CHROM', 'POS', target_individual]]
other_genotypes = df[['#CHROM', 'POS'] + other_individuals]

# Function for computing the realized-load generation and elimination proportions
def calculate_load_ratios(target_genotypes, other_genotypes):
    results = []
    for other in other_individuals:
        generation_ratios = []
        elimination_ratios = []
        for i, row in target_genotypes.iterrows():
            tgt_gt = row[target_individual]
            other_gt = other_genotypes.at[i, other]
            # Only count sites that match one of the conditions below
            if tgt_gt == '1/1' and other_gt == '1/1':
                generation_ratios.append(1.0)
            elif tgt_gt == '1/1' and other_gt == '0/1':
                generation_ratios.append(0.5)
                elimination_ratios.append(0.5)
            elif tgt_gt == '0/1' and other_gt == '1/1':
                generation_ratios.append(0.5)
                elimination_ratios.append(0.5)
            elif tgt_gt == '0/1' and other_gt == '0/1':
                generation_ratios.append(0.25)
                elimination_ratios.append(0.25)
            elif tgt_gt == '1/1' and other_gt == '0/0':
                elimination_ratios.append(1.0)
            elif tgt_gt == '0/1' and other_gt == '0/0':
                elimination_ratios.append(0.5)

        if generation_ratios:
            generation_ratio_avg = np.mean(generation_ratios)
        else:
            generation_ratio_avg = 0.0

        if elimination_ratios:
            elimination_ratio_avg = np.mean(elimination_ratios)
        else:
            elimination_ratio_avg = 0.0

        results.append((other, generation_ratio_avg, elimination_ratio_avg))
    return results

# Compute the load generation and elimination proportions
results = calculate_load_ratios(target_genotypes, other_genotypes)

# Convert the results to a DataFrame
df_results = pd.DataFrame(results, columns=['individual', 'generation_ratio', 'elimination_ratio'])

# Report the best pairing options
best_generation_pair = df_results.loc[df_results['generation_ratio'].idxmin()]
best_elimination_pair = df_results.loc[df_results['elimination_ratio'].idxmax()]

best_generation_result = f"最佳实现负荷生成最小的配对方案是与个体 {best_generation_pair['individual']} 配对,实现负荷生成比例为 {best_generation_pair['generation_ratio']:.2f}\n"
best_elimination_result = f"最佳实现负荷消除最大的配对方案是与个体 {best_elimination_pair['individual']} 配对,实现负荷消除比例为 {best_elimination_pair['elimination_ratio']:.2f}\n"

# Full results table
detailed_results = df_results.to_string(index=False)

# Save the results to a file
with open('MTKL_result.txt', 'w') as f:
    f.write(best_generation_result)
    f.write(best_elimination_result)
    f.write("\n详细结果:\n")
    f.write(detailed_results)

# Print the results to the console
print(best_generation_result)
print(best_elimination_result)
print("详细结果:")
print(detailed_results)

After this, collect the statistics; the code below prints the results straight to the screen

### Make a new sample-name file and delete the sample names that are not needed
cp sample.id sample1.id
vi sample1.id
### Show the full results; print them to the screen and then paste them into Excel
cat sample1.id | while read id ; do grep "^   $id" MTKL_result.txt | awk '{print $2,$3}'; done