A summary of the entire data-processing pipeline
1. From the literature: LD pruning should be applied with caution when calling ROH; instead, raising the stringency of the ROH-detection criteria is recommended to correct for false-positive ROH caused by LD.
## Convert to VCF format
plink --bfile tig4 --export vcf --out test2
PopLDdecay -InVCF test2.vcf -OutStat LDdecay
perl /home/yjc/Software/PopLDdecay-3.40/bin/Plot_OnePop.pl -inFile LDdecay.stat.gz -output Fig
Fig.png
Interpreting the figure:
The plot shows the LD (linkage disequilibrium) decay curve: the x-axis is the distance between SNPs in kilobases (Kb), and the y-axis is the LD statistic r². r² decays as the distance between SNPs increases.
Initial high-LD region: at the far left (0-50 Kb), r² is very high, close to 0.8, indicating strong LD over short distances.
Rapid decay: between 50 and 100 Kb, r² drops quickly to about 0.4.
Plateau: beyond 100 Kb the decline slows and the curve levels off, reaching roughly 0.3 at 300 Kb, so the influence of LD keeps shrinking over longer distances.
To tighten the ROH-detection criteria through the minimum ROH length alone: since r² levels off beyond 100 Kb, a minimum ROH length greater than 100 Kb, e.g. 200 Kb or more, should keep LD-driven false positives low.
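The decay distance can also be read off the table rather than the plot; a minimal sketch, assuming (as in PopLDdecay's .stat output) that the first column is the pairwise distance in bp and the second is the mean r² (check the header of your version):
import gzip

# Scan LDdecay.stat.gz for the first distance at which mean r^2 falls below a cutoff.
threshold = 0.3
with gzip.open('LDdecay.stat.gz', 'rt') as f:
    for line in f:
        if line.startswith('#'):
            continue  # skip the header line
        fields = line.split()
        dist, r2 = int(fields[0]), float(fields[1])
        if r2 <= threshold:
            print(f'mean r^2 first drops below {threshold} at ~{dist/1000:.0f} Kb')
            break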
2. Comparing the correlation between Fped and FROH under different QC settings:
MAF0.05
MAF0.01
no_MAF
LD_0.9
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages
# Load the Excel file
file_path = 'Fped_Froh0.05_0.01_nomaf_LD2.xlsx'
df = pd.read_excel(file_path)
# Reshape from wide to long format for Seaborn plotting
df_melted = df.melt(id_vars=['ID', 'Fped'], value_vars=['Froh_maf005', 'Froh_maf001', 'Froh_nomaf', 'Froh_LD0.9'],
                    var_name='Method', value_name='FROH')
# Define the color and marker for each method
palette = {
'Froh_maf005': '#E3371E',
'Froh_maf001': '#FF7A48',
'Froh_nomaf': '#0593A2',
'Froh_LD0.9': '#103778'
}
markers = {
'Froh_maf005': 'X',
'Froh_maf001': '^',
'Froh_nomaf': 'D',
'Froh_LD0.9': 'o'
}
# Compute the Pearson correlation coefficient for each method
corr_maf005, _ = pearsonr(df['Fped'], df['Froh_maf005'])
corr_maf001, _ = pearsonr(df['Fped'], df['Froh_maf001'])
corr_nomaf, _ = pearsonr(df['Fped'], df['Froh_nomaf'])
corr_LD09, _ = pearsonr(df['Fped'], df['Froh_LD0.9'])
# Create the scatter plot
plt.figure(figsize=(6, 6))
sns.scatterplot(data=df_melted, x='Fped', y='FROH', hue='Method', style='Method', palette=palette, markers=markers, s=100)
# Add a regression line per method, without confidence bands
sns.regplot(data=df, x='Fped', y='Froh_maf005', scatter=False, color='#E3371E', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_maf001', scatter=False, color='#FF7A48', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_nomaf', scatter=False, color='#0593A2', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_LD0.9', scatter=False, color='#103778', ci=None)
# Add the y = x reference line
plt.plot([0, 0.7], [0, 0.7], linestyle='--', color='black')
# Optional: annotate the Pearson correlations on the plot
#plt.text(0.05, 0.65, f'Pearson r (Froh_maf005) = {corr_maf005:.2f}', color='#E3371E')
#plt.text(0.05, 0.60, f'Pearson r (Froh_maf001) = {corr_maf001:.2f}', color='#FF7A48')
#plt.text(0.05, 0.55, f'Pearson r (Froh_nomaf) = {corr_nomaf:.2f}', color='#0593A2')
#plt.text(0.05, 0.50, f'Pearson r (Froh_LD0.9) = {corr_LD09:.2f}', color='#103778')
# Legend and axis labels
plt.legend(title='Method')
plt.xlabel('Fped')
plt.ylabel('FROH')
plt.title('Correlation between Fped and FROH with different methods')
# Hide the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Save the figure as a PDF
pdf_path = 'Correlation_Fped_FROH.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()
# Show the figure
plt.show()
Correlation_Fped_FROH.jpg
From the figure above, a first read is that filtering at MAF 0.05 yields the highest correlation with Fped.
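Rather than judging from the plot alone, the four coefficients computed in the script above can be printed side by side (this reuses the corr_* variables from that script):
# Print the four Pearson correlations for direct comparison.
for name, r in [('Froh_maf005', corr_maf005), ('Froh_maf001', corr_maf001),
                ('Froh_nomaf', corr_nomaf), ('Froh_LD0.9', corr_LD09)]:
    print(f'{name}: r = {r:.3f}')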
3. Using more data to verify that MAF 0.05 filtering gives the strongest correlation
library(ggplot2)
# Assume the data frame df contains the correlation columns MAF0.05, MAF0.01, no_MAF and LD
# Read the data
df <- read.csv("data1.csv")
# Scatter plot
ggplot(data = df, aes(x = seq_len(nrow(df)))) +
  geom_point(aes(y = MAF0.05, color = "MAF > 0.05"), size = 3, alpha = 0.7) +
  geom_point(aes(y = MAF0.01, color = "MAF > 0.01"), size = 3, alpha = 0.7) +
  geom_point(aes(y = no_MAF, color = "no_MAF"), size = 3, alpha = 0.7) +
  geom_point(aes(y = LD, color = "LD"), size = 3, alpha = 0.7) +
  scale_color_manual(values = c("MAF > 0.05" = "#E3371E", "MAF > 0.01" = "#FF7A48", "no_MAF" = "#0593A2", "LD" = "#103778")) +
  labs(title = "Correlation of FROH with Fped for Different Filtering Criteria",
       x = "Sample Index", y = "Correlation Coefficient",
       color = "Filter") +
  theme_minimal() +                            # apply the base theme before theme() so it does not overwrite the settings below
  theme(panel.grid = element_blank(),          # remove grid lines
        panel.background = element_blank(),    # remove background
        axis.line = element_line(size = 1),    # axis line width
        axis.ticks = element_line(size = 0.5), # tick mark width
        text = element_text(size = 12),
        plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 14),  # axis title font size
        axis.text = element_text(size = 10))   # tick label font size
# Save the figure (ggsave writes the last plot to file; it does not display it)
ggsave("enhanced_scatter_plot.pdf", width = 8, height = 8)
more_test_Fped_4Froh_plot.jpg
The figure shows that the MAF 0.05 filtering standard gives FROH a higher correlation with the pedigree in most cases.
4. Computing relatedness coefficients under the different filtering criteria and checking their correlation with pedigree relatedness
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages
# Load the Excel file
file_path = 'relation_RPED_Rplink.xlsx'
df = pd.read_excel(file_path)
# Reshape from wide to long format for Seaborn plotting
df_melted = df.melt(id_vars=['relationship', 'Rped'], value_vars=['R_maf0.05', 'R_maf0.01', 'R_nomaf', 'R_LD0.9'],
                    var_name='Method', value_name='Rgenome')
# Define the color and marker for each method
palette = {
'R_maf0.05': '#E3371E',
'R_maf0.01': '#FF7A48',
'R_nomaf': '#0593A2',
'R_LD0.9': '#103778'
}
markers = {
'R_maf0.05': 'X',
'R_maf0.01': '^',
'R_nomaf': 'D',
'R_LD0.9': 'o'
}
# Compute the Pearson correlation coefficient for each method
corr_maf005, _ = pearsonr(df['Rped'], df['R_maf0.05'])
corr_maf001, _ = pearsonr(df['Rped'], df['R_maf0.01'])
corr_nomaf, _ = pearsonr(df['Rped'], df['R_nomaf'])
corr_LD09, _ = pearsonr(df['Rped'], df['R_LD0.9'])
# Create the scatter plot
plt.figure(figsize=(6, 6))
sns.scatterplot(data=df_melted, x='Rped', y='Rgenome', hue='Method', style='Method', palette=palette, markers=markers, s=100)
# Add a regression line per method, without confidence bands
sns.regplot(data=df, x='Rped', y='R_maf0.05', scatter=False, color='#E3371E', ci=None)
sns.regplot(data=df, x='Rped', y='R_maf0.01', scatter=False, color='#FF7A48', ci=None)
sns.regplot(data=df, x='Rped', y='R_nomaf', scatter=False, color='#0593A2', ci=None)
sns.regplot(data=df, x='Rped', y='R_LD0.9', scatter=False, color='#103778', ci=None)
# Add the y = x reference line
plt.plot([0, 0.7], [0, 0.7], linestyle='--', color='black')
# Annotate the Pearson correlations on the plot
plt.text(0.05, 0.65, f'Pearson r (R_maf0.05) = {corr_maf005:.2f}', color='#E3371E')
plt.text(0.05, 0.60, f'Pearson r (R_maf0.01) = {corr_maf001:.2f}', color='#FF7A48')
plt.text(0.05, 0.55, f'Pearson r (R_nomaf) = {corr_nomaf:.2f}', color='#0593A2')
plt.text(0.05, 0.50, f'Pearson r (R_LD0.9) = {corr_LD09:.2f}', color='#103778')
# Legend and axis labels
plt.legend(title='Method')
plt.xlabel('Rped')
plt.ylabel('Rgenome')
plt.title('Correlation between Rped and Rgenome with different methods')
# Hide the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Save the figure as a PDF
pdf_path = 'Correlation_Rped_Rgenome.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()
# Show the figure
plt.show()
[Figure: Rped vs Rgenome correlation scatter plot]
5. Parallel-coordinates plot (the value range of the correlation color gradient can be adjusted)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize, LinearSegmentedColormap
from pandas.plotting import parallel_coordinates
# Load the data
data_path = 'wolf_kb.csv'  # adjust the path to your file location
data = pd.read_csv(data_path)
# Normalize the 'correlation' values for color mapping with specified range 0.5000 to 0.6500
norm = Normalize(vmin=0.5000, vmax=0.6500)
# Create a custom color map from #0593A2 to #FF7A48
colors = ["#0593A2", "#FF7A48"]
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
# Plot the parallel coordinates
fig, ax = plt.subplots(figsize=(8, 6))
parallel_coordinates(data.drop(columns='correlation_norm', errors='ignore'), class_column='correlation', ax=ax,
color=cmap(norm(data['correlation'].values)), alpha=0.5)
# Adjust the plot
plt.xticks(rotation=45, ha='right')
plt.title('Impact of Parameter Variation on Correlation with Colorbar')
# Create colorbar as a legend for the 'correlation'
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label('Correlation')
# Adjust layout
plt.tight_layout()
# Save the updated plot with colorbar as a PDF
colorbar_plt_path = 'colorbar_parallel_coordinates_plot.pdf'
plt.savefig(colorbar_plt_path, bbox_inches='tight')
plt.show()
Without adjusting the value range:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize, LinearSegmentedColormap
from pandas.plotting import parallel_coordinates
# Load the data
data_path = 'snp.csv'  # adjust the path to your file location
data = pd.read_csv(data_path)
# Normalize the 'correlation' values for color mapping
norm = Normalize(vmin=data['correlation'].min(), vmax=data['correlation'].max())
# Create a custom color map from #0593A2 to #FF7A48
colors = ["#0593A2", "#FF7A48"]
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
# Plot the parallel coordinates
fig, ax = plt.subplots(figsize=(8, 6))
parallel_coordinates(data.drop(columns='correlation_norm', errors='ignore'), class_column='correlation', ax=ax,
color=cmap(norm(data['correlation'].values)), alpha=0.5)
# Adjust the plot
plt.xticks(rotation=45, ha='right')
plt.title('Impact of Parameter Variation on Correlation with Colorbar')
# Create colorbar as a legend for the 'correlation'
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label('Correlation')
# Adjust layout
plt.tight_layout()
# Save the updated plot with colorbar as a PDF
colorbar_plt_path = 'colorbar_parallel_coordinates_plot.pdf'
plt.savefig(colorbar_plt_path, bbox_inches='tight')
plt.show()
[Figure: parallel-coordinates plot with correlation colorbar]
Code used to produce the ROH data in the table below
[Table: per-length-class ROH statistics]
## To get the second column, first split the full ROH.hom into per-individual ROH files (the splitting script is in an earlier post). Then a new script checks whether each individual's file contains any ROH within the given range; if it does, the file name is written to output.TXT. Before running, put all the individual file names into in.put. The script:
#!/bin/bash
# Lower and upper bounds of the length range (values in column 9 of the per-individual ROH files)
lower_bound=20000
upper_bound=100000
# Truncate or create output.TXT
> output.TXT
# Read each file name listed in in.put
while read -r file; do
    # Check that the file exists
    if [ -f "$file" ]; then
        echo "Processing file: $file" # debug message
        # Use awk to test whether any value in column 9 falls within the range
        if awk -v lb=$lower_bound -v ub=$upper_bound 'NR > 1 && $9 >= lb && $9 <= ub {print "Found:", $9; found=1; exit} END {if (!found) exit 1}' "$file"; then
            # If so, append the file name to output.TXT
            echo "$file" >> output.TXT
            echo "$file contains values in the range" # debug message
        else
            echo "$file does not contain values in the range" # debug message
        fi
    else
        echo "File $file does not exist" # debug message
    fi
done < in.put
## To get the third column:
cat ROH.hom | awk 'NR > 1 && $9 >= 500 && $9 <1000' | wc -l
## The fourth column is the third column divided by the total ROH count
## The fifth column is the third column divided by the second column
## To get the sixth column:
cat ROH.hom | awk 'NR > 1 && $9 >= 500 && $9 <1000' | awk '{sum += $9} END {print sum / 10173}' ## 10173 is the third-column value (the ROH count in this range)
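Columns 2-6 can also be computed in one pass with pandas; a sketch assuming ROH.hom is PLINK's standard whitespace-delimited output, with the individual ID in the IID column and the ROH length in KB in the KB column (the 500-1000 bin is just the example range from above):
import pandas as pd

roh = pd.read_csv('ROH.hom', sep=r'\s+')       # PLINK .hom table with header
lo, hi = 500, 1000                             # length class in KB
in_bin = roh[(roh['KB'] >= lo) & (roh['KB'] < hi)]
n_inds = in_bin['IID'].nunique()               # column 2: individuals with ROH in the bin
count = len(in_bin)                            # column 3: ROH count in the bin
print(n_inds, count,
      count / len(roh),                        # column 4: share of all ROH
      count / n_inds,                          # column 5: mean count per individual
      in_bin['KB'].mean())                     # column 6: mean ROH length (KB)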
Code used to produce the data in the table below
[Table: FROH by ROH length class]
### This table computes FROH from ROH in different length ranges. The idea: select the ROH within the given range from the full ROH file, sum the ROH lengths per individual, write the summed total together with the individual ID to a file, and divide by the total genome length.
### The first script, a Python script, generates the per-individual commands
#!/usr/bin/env python
inputfile = "sample25000.id"   # sample IDs, one per line (no .txt suffix)
outputfile = "mean.sh"
# 25000.txt holds the ROH selected from the full ROH file for this length range;
# 2258664.721 is the total genome length in the same units as column 9.
with open(outputfile, "w") as output:
    for line in open(inputfile):
        ind = line.rstrip()
        output.write(f"cat 25000.txt | awk '$2=={ind}' > {ind}.txt\n")
        output.write(f"awk 'BEGIN{{sum = 0}} {{sum += $9}} END {{print \"{ind} \"sum/2258664.721}}' {ind}.txt >> mean.txt\n")
### This generates a sh script with the following content:
cat 25000.txt | awk '$2==299'> 299.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "299 "sum/2258664.721}' 299.txt > mean.txt ### the generator emits >> here; change this first line to > by hand (or truncate mean.txt beforehand)
cat 25000.txt | awk '$2==363'> 363.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "363 "sum/2258664.721}' 363.txt >> mean.txt
cat 25000.txt | awk '$2==495'> 495.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "495 "sum/2258664.721}' 495.txt >> mean.txt
cat 25000.txt | awk '$2==542'> 542.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "542 "sum/2258664.721}' 542.txt >> mean.txt
cat 25000.txt | awk '$2==550'> 550.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "550 "sum/2258664.721}' 550.txt >> mean.txt
cat 25000.txt | awk '$2==585'> 585.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "585 "sum/2258664.721}' 585.txt >> mean.txt
cat 25000.txt | awk '$2==594'> 594.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "594 "sum/2258664.721}' 594.txt >> mean.txt
cat 25000.txt | awk '$2==605'> 605.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "605 "sum/2258664.721}' 605.txt >> mean.txt
cat 25000.txt | awk '$2==670'> 670.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "670 "sum/2258664.721}' 670.txt >> mean.txt
cat 25000.txt | awk '$2==708'> 708.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "708 "sum/2258664.721}' 708.txt >> mean.txt
cat 25000.txt | awk '$2==720'> 720.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "720 "sum/2258664.721}' 720.txt >> mean.txt
cat 25000.txt | awk '$2==740'> 740.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "740 "sum/2258664.721}' 740.txt >> mean.txt
cat 25000.txt | awk '$2==771'> 771.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "771 "sum/2258664.721}' 771.txt >> mean.txt
# 1. Run the sh script
# 2. Compute the mean and standard deviation
awk '{print $2}' mean.txt | datamash mean 1 sstdev 1
# 3. Get the minimum and maximum (numeric sort; plain sort would compare the values as strings)
cat mean.txt | awk '{print $2}' | sort -n | head -1
cat mean.txt | awk '{print $2}' | sort -n | tail -1
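As a cross-check of the datamash/sort lines, the same summary in pandas (assuming mean.txt holds the individual ID in column 1 and FROH in column 2, as written by the generated script):
import pandas as pd

froh = pd.read_csv('mean.txt', sep=r'\s+', header=None, names=['ID', 'FROH'])
print(froh['FROH'].agg(['mean', 'std', 'min', 'max']))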
6. Bar and line plot of ROH count and mean ROH length per chromosome
import pandas as pd
import matplotlib.pyplot as plt
# Load the data
file_path = 'count.xlsx'
df = pd.read_excel(file_path)
# Sort the dataframe by Chromosome to ensure order
df = df.sort_values(by='Chromosome')
# Plotting ROH counts with only specified chromosomes displayed
fig, ax1 = plt.subplots(figsize=(10, 6))
color = 'tab:blue'
ax1.set_xlabel('Chromosome')
ax1.set_ylabel('ROH Counts', color=color)
ax1.bar(df['Chromosome'], df['ROH count'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# Set x-axis ticks to match the chromosome numbers without gaps
ax1.set_xticks(df['Chromosome'])
ax1.set_xticklabels(df['Chromosome'])
# Create a second y-axis for the average ROH length
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Average ROH Length (Mbp)', color=color)
ax2.plot(df['Chromosome'], df['Average ROH length(Mbp)'], color=color, marker='o')
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.title('ROH Counts and Average ROH Length by Chromosome')
plot_path = 'ROH_Count_Length.pdf'
plt.savefig(plot_path, bbox_inches='tight')
plt.show()
ROH_Count_Length.jpg
7. Correlation between pedigree inbreeding and Froh computed from different ROH lengths
A small but handy script: the individual IDs in the generated Froh file are not in the same order as in my original ID list. An example:
The ID order I want:
1
2
3
4
5
6
The ID order in the output is:
2
3
5
1
4
6
The script below restores the order to 1, 2, 3, 4, 5, 6:
import pandas as pd
# Load the data from the two input files
order_file_path = '1.txt.txt'
data_file_path = '10000up.txt'
# Load the desired ID order
with open(order_file_path, 'r') as file:
    order_list = file.read().splitlines()
# Load the data from 10000up.txt
data_df = pd.read_csv(data_file_path, sep=' ', header=None, names=['ID', 'Value'])
# Ensure both 'ID' columns are of the same type (string)
order_df = pd.DataFrame(order_list, columns=['ID'])
order_df['ID'] = order_df['ID'].astype(str)
data_df['ID'] = data_df['ID'].astype(str)
# Merge the data to follow the order in the order list
merged_df = order_df.merge(data_df, on='ID', how='left')
# Save the resulting DataFrame to a new file
merged_file_path = 'merged_10000up.txt'
merged_df.to_csv(merged_file_path, sep=' ', index=False, header=False)
print(f'Merged file saved to: {merged_file_path}')
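One check worth doing after the merge: with how='left', any ID that appears in the order list but not in the data file comes back as NaN. A quick check, reusing merged_df from the script above:
# List the IDs from the order file that were not found in the data file.
missing = merged_df.loc[merged_df['Value'].isna(), 'ID'].tolist()
if missing:
    print('IDs missing from the data file:', missing)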
Plot the correlations
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages
# Load the Excel file
file_path = 'different_roh.xlsx'
df = pd.read_excel(file_path)
# Reshape from wide to long format for Seaborn plotting
df_melted = df.melt(id_vars=['ID', 'Fped'], value_vars=['Froh', 'Froh_1000', 'Froh_3000', 'Froh_5000', 'Froh_10000', 'Froh_10000up'],
                    var_name='Method', value_name='FROH')
# Define the color and marker for each method
palette = {
'Froh': '#E3371E',
'Froh_1000': '#FF7A48',
'Froh_3000': '#0593A2',
'Froh_5000': '#103778',
'Froh_10000': '#1EE3B0',
'Froh_10000up': '#48D6FF'
}
markers = {
'Froh': 'X',
'Froh_1000': '^',
'Froh_3000': 'D',
'Froh_5000': 'o',
'Froh_10000': '*',
'Froh_10000up': 'v'
}
# Compute the Pearson correlation coefficient for each method
corr_Froh, _ = pearsonr(df['Fped'], df['Froh'])
corr_Froh_1000, _ = pearsonr(df['Fped'], df['Froh_1000'])
corr_Froh_3000, _ = pearsonr(df['Fped'], df['Froh_3000'])
corr_Froh_5000, _ = pearsonr(df['Fped'], df['Froh_5000'])
corr_Froh_10000, _ = pearsonr(df['Fped'], df['Froh_10000'])
corr_Froh_10000up, _ = pearsonr(df['Fped'], df['Froh_10000up'])
# Create the scatter plot
plt.figure(figsize=(6, 6))
sns.scatterplot(data=df_melted, x='Fped', y='FROH', hue='Method', style='Method', palette=palette, markers=markers, s=100)
# Add a regression line per method, without confidence bands
sns.regplot(data=df, x='Fped', y='Froh', scatter=False, color='#E3371E', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_1000', scatter=False, color='#FF7A48', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_3000', scatter=False, color='#0593A2', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_5000', scatter=False, color='#103778', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_10000', scatter=False, color='#1EE3B0', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_10000up', scatter=False, color='#48D6FF', ci=None)
# Add the y = x reference line
plt.plot([0, 0.7], [0, 0.7], linestyle='--', color='black')
# Optional: annotate the Pearson correlations on the plot
#plt.text(0.05, 0.65, f'Pearson r (Froh) = {corr_Froh:.2f}', color='black')
#plt.text(0.05, 0.60, f'Pearson r (Froh_1000) = {corr_Froh_1000:.2f}', color='black')
#plt.text(0.05, 0.55, f'Pearson r (Froh_3000) = {corr_Froh_3000:.2f}', color='black')
#plt.text(0.05, 0.50, f'Pearson r (Froh_5000) = {corr_Froh_5000:.2f}', color='black')
#plt.text(0.05, 0.45, f'Pearson r (Froh_10000) = {corr_Froh_10000:.2f}', color='black')
#plt.text(0.05, 0.40, f'Pearson r (Froh_10000up) = {corr_Froh_10000up:.2f}', color='black')
# Legend and axis labels
plt.legend(title='Method')
plt.xlabel('Fped')
plt.ylabel('FROH')
#plt.title('Correlation between Fped and Fgenome with different methods')
# Hide the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
# Save the figure as a PDF
pdf_path = 'different_ROH.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()
# Show the figure
plt.show()
different_ROH.jpg
8. Computing genetic-diversity parameters
1. Nucleotide diversity (pi)
## Compute pi
vcftools --vcf /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome.vcf --window-pi 500000 --window-pi-step 100000 --out SCT_pi
## Extract the pi column and drop the header line
awk '{print $5}' SCT_pi.windowed.pi | sed '1d' > A.txt
## Sort in descending order
sort -n -r A.txt > B.txt
## Compute the mean and standard deviation
awk '{print $1}' A.txt | datamash mean 1 sstdev 1 > SCT_mean_pi.txt
## Keep columns 1 and 5 (dropping the header row)
cat SCT_pi.windowed.pi | awk 'NR > 1' | awk '{print $1,$5}' > sct_pi.txt
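The awk/datamash steps above can also be reproduced in pandas; a sketch assuming vcftools' .windowed.pi output is tab-delimited with columns CHROM, BIN_START, BIN_END, N_VARIANTS, PI:
import pandas as pd

pi = pd.read_csv('SCT_pi.windowed.pi', sep='\t')
print(pi['PI'].agg(['mean', 'std']))  # mean and sd of windowed pi
# Write chromosome and pi, matching the awk extraction above.
pi[['CHROM', 'PI']].to_csv('sct_pi.txt', sep=' ', index=False, header=False)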
Data conversion
## Reshape the data into 4 columns to match the Manhattan-plot input format
import pandas as pd
import numpy as np
# Read the data file
file_path = 'sct_pi.txt'  # make sure sct_pi.txt sits next to the script, or give the full path
# note: the 'p_value' column actually holds the windowed pi value
data = pd.read_csv(file_path, sep=" ", header=None, names=['chromosome', 'p_value'])
# Generate per-chromosome position indices
data['position'] = data.groupby('chromosome').cumcount()
# Compute -log10(p_value)
data['-log10(p_value)'] = -np.log10(data['p_value'])
# Reorder the columns
data = data[['chromosome', 'position', 'p_value', '-log10(p_value)']]
# Save the result
output_file_path = 'SCT_Pi_manhadun.txt'  # output file name
data.to_csv(output_file_path, sep="\t", index=False)
print(f"Done; results saved to {output_file_path}")
Plot the Manhattan plot
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# Read the processed data
file_path = 'SCT_Pi_manhadun.txt'  # make sure the file sits next to the script, or give the full path
data = pd.read_csv(file_path, sep="\t")
# Collect the chromosome IDs
chromosomes = sorted(data['chromosome'].unique())
# Compute cumulative positions for each chromosome
data['cumulative_pos'] = data.groupby('chromosome').cumcount()
offset = 0
chromosome_offsets = {}
for chrom in chromosomes:
    chromosome_offsets[chrom] = offset
    offset += data[data['chromosome'] == chrom]['cumulative_pos'].max() + 1
data['adjusted_pos'] = data.apply(lambda row: row['cumulative_pos'] + chromosome_offsets[row['chromosome']], axis=1)
# Draw the Manhattan plot
fig, ax = plt.subplots(figsize=(8, 4))
# Assign a different color to each chromosome
colors = plt.cm.tab20.colors
# Plot each chromosome's points
for i, chrom in enumerate(chromosomes):
    chrom_data = data[data['chromosome'] == chrom]
    ax.scatter(chrom_data['adjusted_pos'], chrom_data['-log10(p_value)'], color=colors[i % len(colors)], s=2, label=f'Chromosome {chrom}' if i == 0 else "")
# Add reference lines
ax.axhline(y=3.75, color='r', linestyle='--')
ax.axhline(y=5, color='g', linestyle='--')
# Labels and title
ax.set_xlabel('Chromosome')
ax.set_ylabel('-log10(p_value)')
ax.set_title('Manhattan Plot')
# Place the x-axis ticks at chromosome midpoints
chromosome_labels = []
tick_positions = []
for chrom in chromosomes:
    midpoint = (chromosome_offsets[chrom] + chromosome_offsets[chrom] + data[data['chromosome'] == chrom]['cumulative_pos'].max()) // 2
    tick_positions.append(midpoint)
    chromosome_labels.append(chrom)
ax.set_xticks(tick_positions)
ax.set_xticklabels(chromosome_labels)
# Save the figure as a PDF
pdf_path = 'Pi_manhadun.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()
# Show the legend / figure (optional)
#ax.legend()
#plt.show()
Pi_manhadun.png
2. PIC (polymorphism information content)
Polymorphism information content (PIC) measures how polymorphic a marker (such as a SNP locus) is and how much information it provides in a population, i.e. its ability to distinguish different genotypes. The higher the PIC value, the more polymorphic and informative the marker.
For a biallelic marker with allele frequencies p and q = 1 - p: PIC = 1 - (p^2 + q^2) - 2*p^2*q^2 (the formula implemented in the R script below).
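A quick sanity check of the formula (the values here are illustrative, not from the data):
# Biallelic PIC: 1 - (p^2 + q^2) - 2 p^2 q^2; maximal at p = 0.5.
def pic(p):
    q = 1 - p
    return 1 - (p**2 + q**2) - 2 * p**2 * q**2

print(pic(0.5))   # 0.375, the biallelic maximum
print(pic(0.05))  # a low-information marker near the MAF filter threshold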
### Compute MAF
plink --file /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome --freq --out sct_allele_freq
Compute PIC with an R script:
# Read the allele-frequency file
freq_data <- read.table("sct_allele_freq.frq", header = TRUE)
# Function to compute the PIC value
calculate_pic <- function(p) {
  q <- 1 - p
  pic <- 1 - (p^2 + q^2) - 2 * p^2 * q^2
  return(pic)
}
# Extract the allele frequencies and compute PIC per SNP
freq_data$MAF <- as.numeric(freq_data$MAF)
freq_data$PIC <- sapply(freq_data$MAF, calculate_pic)
# Save the result
write.table(freq_data, file = "allele_freq_with_PIC.txt", row.names = FALSE, quote = FALSE)
awk '{print $1,$7}' allele_freq_with_PIC.txt | sed "1d" > PIC.txt
awk '{print $2}' PIC.txt | datamash mean 1 sstdev 1 > PIC_mean.txt
3. Computing Ho and He
plink --file /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome --hardy --out SCT
awk '{print $7 "\t" $8}' SCT.hwe | sed '1d' > SCT_het.txt
awk '{print $1}' SCT_het.txt | datamash mean 1 sstdev 1 > SCT_Ho.txt
awk '{print $2}' SCT_het.txt | datamash mean 1 sstdev 1 > SCT_He.txt
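The awk lines rely on column positions ($7 and $8 of the .hwe file are O(HET) and E(HET) in PLINK's output); the same summary can be done by column name with pandas, a minimal sketch:
import pandas as pd

# PLINK --hardy output: one row per SNP with observed and expected heterozygosity.
hwe = pd.read_csv('SCT.hwe', sep=r'\s+')
print('Ho:', hwe['O(HET)'].mean(), hwe['O(HET)'].std())
print('He:', hwe['E(HET)'].mean(), hwe['E(HET)'].std())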
4. Nei's diversity index
Nei's diversity index (also called Nei's gene diversity) is an important measure of genetic diversity within a population. Proposed by Nei (1973), it estimates the degree of genetic variation within a population and is widely used in molecular evolution and population genetics.
Nei's gene diversity at a locus: H = 1 - sum(p_i^2); for a biallelic marker, H = 1 - (p^2 + q^2) (as implemented in the R script below).
# Read the allele-frequency file
freq_data <- read.table("../PIC/sct_allele_freq.frq", header = TRUE)
# Inspect the structure of the frequency file
str(freq_data)
# Function to compute Nei's diversity index
calculate_nei <- function(p) {
  q <- 1 - p
  H <- 1 - (p^2 + q^2)
  return(H)
}
# Extract the allele frequencies and compute Nei's diversity index
freq_data$MAF <- as.numeric(freq_data$MAF)
freq_data$Nei_H <- sapply(freq_data$MAF, calculate_nei)
# Save the result
write.table(freq_data, file = "allele_freq_with_Nei_H.txt", row.names = FALSE, quote = FALSE)
# Show the result
head(freq_data)
5. Shannon-Wiener index
The Shannon-Wiener index can indeed be used to reflect genetic diversity, although in ecology it is more often used to measure species diversity. Genetic diversity is the degree of genetic variation within a population, and the Shannon-Wiener index captures it by accounting for both the richness and the evenness of allele frequencies.
# Read the allele-frequency file
freq_data <- read.table("../PIC/sct_allele_freq.frq", header = TRUE)
# Inspect the structure of the frequency file
str(freq_data)
# Function to compute the Shannon-Wiener index
calculate_shannon <- function(p) {
  q <- 1 - p
  H <- - (p * log(p) + q * log(q))
  return(H)
}
# Drop NA values
freq_data <- freq_data[!is.na(freq_data$MAF), ]
# Extract the allele frequencies and compute the Shannon-Wiener index
freq_data$Shannon_H <- sapply(freq_data$MAF, calculate_shannon)
# Save the result
write.table(freq_data, file = "allele_freq_with_Shannon_H.txt", row.names = FALSE, quote = FALSE)
6. HOM/HET
The number of homozygous sites in the genome divided by the number of heterozygous sites.
First compute the per-individual heterozygosity counts with vcftools:
vcftools --vcf /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome.vcf --het --out SCT.het
After obtaining SCT.het.het, use a Python script to compute the ratio:
# Open the input and output files
with open('SCT.het.het', 'r') as infile, open('HOM_HET.txt', 'w') as outfile:
    # Read the header line (column names)
    header = infile.readline().strip().split()
    # Write the output header
    outfile.write('ID\tO(HOM)\tO(HET)\tHOM/HET\n')
    # Process each data row
    for line in infile:
        data = line.strip().split()
        # vcftools --het columns: INDV, O(HOM), E(HOM), N_SITES, F
        ID = data[0]
        O_HOM = int(data[1])
        N_SITES = int(data[3])
        # Compute O(HET) as the remainder, then the HOM/HET ratio
        O_HET = N_SITES - O_HOM
        HOM_HET_ratio = O_HOM / O_HET if O_HET != 0 else 'Inf'
        # Write the row
        outfile.write(f'{ID}\t{O_HOM}\t{O_HET}\t{HOM_HET_ratio}\n')
7. Tajima's D
vcftools --vcf ../../../var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome.vcf --TajimaD 500000 --out SCT_TajimaD
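The windowed output can be summarized the same way as for pi; a sketch assuming vcftools writes SCT_TajimaD.Tajima.D as a tab-delimited table with a TajimaD column:
import pandas as pd

td = pd.read_csv('SCT_TajimaD.Tajima.D', sep='\t')
print(td['TajimaD'].agg(['mean', 'std', 'min', 'max']))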
8. Computing IBD and the mean IBD proportion
Compute IBD:
plink --bfile ../../../var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome --genome --out ibd
Compute the proportion with an R script:
# Read the PLINK genome output
ibd_data <- read.table("ibd.genome", header = TRUE)
# Mean IBD proportion (PI_HAT) over all pairs
mean_ibd <- mean(ibd_data$PI_HAT)
# Assemble the result as a character vector
result <- c("Mean IBD proportion:", mean_ibd)
# Write the result to a file
writeLines(result, con = "result.txt")
# Save the IBD results for all individual pairs to a file
write.table(ibd_data, file = "ibd_result.txt", row.names = FALSE, quote = FALSE, sep = "\t")
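Beyond the mean PI_HAT, the share of pairs above a relatedness cutoff can also be reported; a sketch in pandas (the 0.25 cutoff is a hypothetical choice, not part of the pipeline above):
import pandas as pd

ibd = pd.read_csv('ibd.genome', sep=r'\s+')   # PLINK --genome output
print('mean PI_HAT:', ibd['PI_HAT'].mean())
# Hypothetical cutoff: pairs with PI_HAT > 0.25 (roughly second-degree or closer).
print('share of pairs with PI_HAT > 0.25:', (ibd['PI_HAT'] > 0.25).mean())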