1. Data download script
# Sample list
SAMPLES = ["SRR1214199", "SRR1214198",
           "SRR1144866", "SRR1144863", "SRR1144860", "SRR1144852",
           "SRR1144850", "SRR1144846"]

# Target rule: paired FASTQ files for every sample
rule all:
    input:
        expand("{sample}/{sample}_1.fastq.gz", sample=SAMPLES),
        expand("{sample}/{sample}_2.fastq.gz", sample=SAMPLES)

# Download each SRA run with prefetch
rule download_SRR:
    output:
        "{sample}/{sample}.sra"
    shell:
        """
        prefetch -X 100G {wildcards.sample}
        """

# Convert the SRA file to paired, gzipped FASTQ
rule translate_fastq:
    input:
        "{sample}/{sample}.sra"
    output:
        "{sample}/{sample}_1.fastq.gz",
        "{sample}/{sample}_2.fastq.gz"
    shell:
        """
        fastq-dump {input} --split-3 --gzip -O {wildcards.sample}/
        """
2. ROH calculation
plink --vcf SCT.recode.vcf --make-bed --out SCT
plink --bfile SCT --recode --out SCT
plink --bfile SCT --homozyg --homozyg-density 20 --homozyg-gap 100 --homozyg-kb 500 --homozyg-snp 50 --homozyg-window-het 3 --homozyg-window-snp 50 --homozyg-window-threshold 0.05 --homozyg-window-missing 5 --out SCT
awk '{print $2,$5}' SCT.hom.indiv > SCT.txt
echo "AT.txt" > name.id
import pandas as pd
import os

def read_filenames_from_id_file(id_file_path):
    with open(id_file_path, 'r') as file:
        filenames = file.read().splitlines()
    return filenames

def process_files(filenames, base_path=''):
    result_df = pd.DataFrame()
    for filename in filenames:
        file_path = os.path.join(base_path, filename)
        # Read the data; assumed whitespace-separated, with the ROH length (kb) in the second column
        df = pd.read_csv(file_path, sep=r"\s+", header=None, usecols=[1])
        # Coerce to numeric so any header text becomes NaN and the arithmetic works
        df[1] = pd.to_numeric(df[1], errors='coerce')
        # Divide the total ROH length (kb) by the genome length (kb) to obtain FROH
        df[1] = df[1] / 2258664.721
        column_name = os.path.basename(file_path).split('.')[0]
        result_df[column_name] = df[1]
    result_df.to_csv('result.csv', index=False)

# Example paths; adjust to the actual files
filenames = read_filenames_from_id_file('name.id')
process_files(filenames)
3. Heterozygosity (het) calculation
plink --bfile SCT1_SNP_ind_maf0.01_LD0.9 --export vcf --out India_tiger_maf0.01_LD0.9 --allow-extra-chr
vcftools --vcf SCT.recode.vcf --het --out SCT
sed '1d' SCT.het | awk '{print $1,"\t",$2,"\t",$4,"\t",($4-$2)/2258664721}' > SCT_het.txt
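The awk one-liner computes each individual's per-bp heterozygosity as (N_SITES − O(HOM)) / 2,258,664,721 from the vcftools --het output. A minimal pandas sketch of the same calculation, for checking, assuming the standard column layout INDV, O(HOM), E(HOM), N_SITES, F:

import pandas as pd

GENOME_BP = 2258664721  # genome length used as the denominator in the awk command above

# vcftools --het output: whitespace-separated with a header line
het = pd.read_csv("SCT.het", sep=r"\s+")
het.columns = ["INDV", "O_HOM", "E_HOM", "N_SITES", "F"]

# heterozygous sites = genotyped sites minus observed homozygous sites
het["HET"] = (het["N_SITES"] - het["O_HOM"]) / GENOME_BP

# same four columns as the awk output
het[["INDV", "O_HOM", "N_SITES", "HET"]].to_csv("SCT_het.txt", sep="\t", index=False, header=False)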
4. Load calculation
perl /home/yjc/Software/annovar/convert2annovar.pl --format vcf4old SCT.recode.vcf --outfile SCT_mit
perl /home/yjc/Software/annovar/annotate_variation.pl SCT_mit /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/ref/test/ --aamatrixfile /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/ref/test/grantham.matrix --outfile sct -buildver SCT
grep "AAMatrix" sct.exonic_variant_function > AAMatrix.txt
## Script (here sample.id contains "AAMatrix")
import re

# Sample-ID file
sample_id_file = 'sample.id'

# Read the sample-ID list
with open(sample_id_file, 'r') as f:
    sample_ids = [line.strip() for line in f]

# Process a single file: keep lines whose AAMatrix (Grantham) score exceeds 150
def process_file(sample_id):
    input_file = f'{sample_id}.txt'
    output_file = f'{sample_id}_filtered.txt'
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # Extract the AAMatrix value with a regular expression
            match = re.search(r'AAMatrix=(\d+)', line)
            if match:
                aamatrix_value = int(match.group(1))
                # Keep the line if the AAMatrix value is greater than 150
                if aamatrix_value > 150:
                    outfile.write(line)
    print(f'Processed {input_file}, results written to {output_file}')

# Process every sample file
for sample_id in sample_ids:
    process_file(sample_id)
awk '{print $5,$6}' AAMatrix_filtered.txt > position.txt
bcftools query -l SCT.recode.vcf
vi sample.id
### Script to split out individuals
#!/bin/bash
# Input files
input_vcf="SCT.recode.vcf"
sample_list="sample.id"

# Loop over the sample list
while IFS= read -r sample
do
    # Output file name
    output_vcf="${sample}.vcf"
    # Extract this sample's genotypes with bcftools
    bcftools view -s "$sample" -Oz -o "${output_vcf}.gz" $input_vcf
    # Decompress the resulting VCF.GZ file
    gunzip "${output_vcf}.gz"
    echo "Generated ${output_vcf}"
done < "$sample_list"
### Snakemake batch-processing script
# Read the sample-ID list
with open('sample.id') as f:
    samples = [line.strip() for line in f]

# Rule all: list every output file that must be produced
rule all:
    input:
        expand("{sample}_position.recode.vcf", sample=samples)

# Extract the candidate positions from each per-sample VCF
rule process_vcf:
    input:
        vcf = "{sample}.vcf",
        positions = "position.txt"
    output:
        recoded_vcf = "{sample}_position.recode.vcf"
    shell:
        "vcftools --vcf {input.vcf} --positions {input.positions} --recode --out {wildcards.sample}_position"
ls *_position.recode.vcf | while read id ; do awk '{print $1,$2,$10}' $id | sed "/#/d" > $id.txt ; done
ls *_position.recode.vcf.txt | while read id ; do grep "1/1" $id | wc -l ; done
ls *_position.recode.vcf.txt | while read id ; do grep "0/1" $id | wc -l ; done
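The two loops above print bare counts with nothing linking each number to its sample. A small Python sketch that tallies homozygous-alt (1/1) and heterozygous (0/1) genotypes per file and keeps the file name beside each count, assuming the three-column *_position.recode.vcf.txt files produced above:

import glob

# Count 1/1 and 0/1 genotypes per individual, keeping the file name next to the counts
for path in sorted(glob.glob("*_position.recode.vcf.txt")):
    hom_alt = 0
    het = 0
    with open(path) as fh:
        for line in fh:
            gt = line.split()[2].split(":")[0]  # third column is the genotype field
            if gt == "1/1":
                hom_alt += 1
            elif gt == "0/1":
                het += 1
    print(path, hom_alt, het, sep="\t")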
LOF analysis
snpEff ref_1 SCT.recode.vcf > SCT.ann
### Filter functionally deleterious variant sites
# Sample-ID file
sample_id_file = 'sample.id'

# Read the sample-ID list
with open(sample_id_file, 'r') as f:
    sample_ids = [line.strip() for line in f]

# Keywords marking loss-of-function annotations
keywords = ["stop_gained", "splice_acceptor_variant", "splice_donor_variant"]

# Process a single annotation file: keep lines containing any keyword
def process_file(sample_id):
    input_file = f'{sample_id}.ann'
    output_file = f'{sample_id}_filtered.ann'
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # Keep the line if it contains any of the keywords
            if any(keyword in line for keyword in keywords):
                outfile.write(line)
    print(f'Processed {input_file}, results written to {output_file}')

# Process every sample file
for sample_id in sample_ids:
    process_file(sample_id)
awk '{print $1,$2}' SCT_filtered.ann > position1.txt
### Snakemake script
# Read the sample-ID list
with open('sample.id') as f:
    samples = [line.strip() for line in f]

# Rule all: list every output file that must be produced
rule all:
    input:
        expand("{sample}_position.recode.vcf", sample=samples)

# Extract the LOF positions from each per-sample VCF
rule process_vcf:
    input:
        vcf = "{sample}.vcf",
        positions = "position1.txt"
    output:
        recoded_vcf = "{sample}_position.recode.vcf"
    shell:
        "vcftools --vcf {input.vcf} --positions {input.positions} --recode --out {wildcards.sample}_position"
ls *_position.recode.vcf | while read id ; do awk '{print $1,$2,$10}' $id | sed "/#/d" > $id.txt ; done
ls *_position.recode.vcf.txt | while read id ; do grep "1/1" $id | wc -l ; done
ls *_position.recode.vcf.txt | while read id ; do grep "0/1" $id | wc -l ; done
Script to compute the average heterozygosity between pairs of individuals
import pandas as pd
import itertools

# Read the per-individual heterozygosity table (from the het step above)
df = pd.read_csv('SCT_het.txt', sep=r"\s+", header=None, names=['ID', 'Col2', 'Col3', 'Het'])

# Map each individual's ID to its heterozygosity
het_dict = df.set_index('ID')['Het'].to_dict()

# All individual IDs
individuals = list(het_dict.keys())

# Pairwise averages of heterozygosity
avg_het_values = []

# Average the heterozygosity of every pair, rounded to 6 decimal places
for pair in itertools.combinations(individuals, 2):
    id1, id2 = pair
    avg_het = round((het_dict[id1] + het_dict[id2]) / 2, 6)
    avg_het_values.append((id1, id2, avg_het))

# Convert to a DataFrame and save
avg_het_df = pd.DataFrame(avg_het_values, columns=['ID1', 'ID2', 'Avg_Het'])
avg_het_df.to_csv('avg_het_values.txt', sep='\t', index=False)
print("Done; results saved to avg_het_values.txt.")
Kinship coefficient calculation between individuals
plink --bfile SCT --genome --min 0 --out kinship
awk '{print $2,$4,$10}' kinship.genome > kinship.txt
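The awk line keeps columns 2, 4 and 10 of the plink .genome output, i.e. IID1, IID2 and PI_HAT (the estimated proportion of IBD sharing). A pandas equivalent sketch that selects those columns by name instead of position:

import pandas as pd

# plink --genome output has a header line; select columns by name
genome = pd.read_csv("kinship.genome", sep=r"\s+")

# IID1/IID2 are the two individuals, PI_HAT is the pairwise relatedness estimate
genome[["IID1", "IID2", "PI_HAT"]].to_csv("kinship.txt", sep=" ", index=False, header=False)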
Pairwise statistics of shared-heterozygosity dissimilarity at the same sites [1 - (2 × number of shared heterozygous sites) / (heterozygous sites of individual 1 + heterozygous sites of individual 2)]
import pandas as pd

# Input VCF file
vcf_file = 'SCT.vcf'

# Recover the column names from the "#CHROM" header line, then load the records
# (comment='#' alone would also discard the header line that carries the sample IDs)
with open(vcf_file) as fh:
    for line in fh:
        if line.startswith('#CHROM'):
            columns = line.lstrip('#').strip().split('\t')
            break
data = pd.read_csv(vcf_file, comment='#', sep='\t', header=None, names=columns)

# Sample IDs start at the 10th column of a VCF
sample_ids = data.columns[9:]

# Keep only the GT part of each genotype field so values like "0/1:..." compare correctly
for sample_id in sample_ids:
    data[sample_id] = data[sample_id].astype(str).str.split(':').str[0]

# Count heterozygous sites per individual
heterozygous_counts = {}
for sample_id in sample_ids:
    heterozygous_counts[sample_id] = ((data[sample_id] == '0/1') | (data[sample_id] == '1/0')).sum()

# Pairwise results
results = []

# For every pair, count shared heterozygous sites and compute the dissimilarity
for i in range(len(sample_ids)):
    for j in range(i + 1, len(sample_ids)):
        id1 = sample_ids[i]
        id2 = sample_ids[j]
        # Sites heterozygous in both individuals
        same_het_count = ((data[id1] == '0/1') & (data[id2] == '0/1')).sum()
        # 1 - (2 * shared het sites) / (het sites of id1 + het sites of id2)
        avg_het = 1 - (2 * same_het_count) / (heterozygous_counts[id1] + heterozygous_counts[id2])
        results.append([id1, id2, avg_het])

# Save the pairwise table and print it
results_df = pd.DataFrame(results, columns=['ID1', 'ID2', 'Avg_Het'])
results_df.to_csv('avg_het_results.csv', index=False)
print(results_df)