Summary of the entire data-processing workflow
1. From the literature, LD pruning should be applied with caution for ROH analysis; instead of pruning, it is recommended to tighten the ROH-detection criteria to correct for false-positive ROH caused by LD.
## Convert to VCF format and estimate LD decay
plink --bfile tig4 --export vcf --out test2
PopLDdecay -InVCF test2.vcf -OutStat LDdecay
perl /home/yjc/Software/PopLDdecay-3.40/bin/Plot_OnePop.pl -inFile LDdecay.stat.gz -output Fig
Analysis of the resulting figure:
The plot shows the LD (linkage disequilibrium) decay curve: the x-axis is the distance between SNPs in kilobases (Kb) and the y-axis is the LD measure r². LD decays as the distance between SNPs increases.
Initial high-LD region: at the far left of the plot (0-50 Kb), r² is very high, close to 0.8, indicating strong LD at short range.
Rapid decay phase: between 50 and 100 Kb, r² drops quickly to about 0.4; LD weakens rapidly with distance.
Plateau phase: beyond 100 Kb the decline slows and levels off, reaching roughly 0.3 at 300 Kb; at longer distances the influence of LD keeps shrinking.
To tighten the ROH-detection criteria via the minimum ROH length alone: since r² plateaus beyond 100 Kb and the influence of LD keeps falling, choosing a minimum ROH length greater than 100 Kb (e.g. 200 Kb or more) should keep false positives low.
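As a concrete sketch of applying that choice (an illustrative command, not the exact one used in this analysis): in PLINK's ROH scan, --homozyg-kb sets the minimum ROH length in kb.
# Hedged example: ROH detection with a 200 Kb minimum length, above the ~100 Kb LD plateau
plink --bfile tig4 --homozyg --homozyg-kb 200 --out roh_strict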
2. Compare the correlation between Fped and FROH under different QC conditions
The four QC conditions: MAF0.05, MAF0.01, no_MAF, and LD_0.9.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages

# Load the Excel file
file_path = 'Fped_Froh0.05_0.01_nomaf_LD2.xlsx'
df = pd.read_excel(file_path)

# Melt from wide to long format for Seaborn plotting
df_melted = df.melt(id_vars=['ID', 'Fped'],
                    value_vars=['Froh_maf005', 'Froh_maf001', 'Froh_nomaf', 'Froh_LD0.9'],
                    var_name='Method', value_name='FROH')

# Colour and marker for each method
palette = {
    'Froh_maf005': '#E3371E',
    'Froh_maf001': '#FF7A48',
    'Froh_nomaf': '#0593A2',
    'Froh_LD0.9': '#103778'
}
markers = {
    'Froh_maf005': 'X',
    'Froh_maf001': '^',
    'Froh_nomaf': 'D',
    'Froh_LD0.9': 'o'
}

# Pearson correlation for each method
corr_maf005, _ = pearsonr(df['Fped'], df['Froh_maf005'])
corr_maf001, _ = pearsonr(df['Fped'], df['Froh_maf001'])
corr_nomaf, _ = pearsonr(df['Fped'], df['Froh_nomaf'])
corr_LD09, _ = pearsonr(df['Fped'], df['Froh_LD0.9'])

# Scatter plot coloured by method
plt.figure(figsize=(6, 6))
sns.scatterplot(data=df_melted, x='Fped', y='FROH', hue='Method', style='Method',
                palette=palette, markers=markers, s=100)

# Regression line for each method, without confidence bands
sns.regplot(data=df, x='Fped', y='Froh_maf005', scatter=False, color='#E3371E', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_maf001', scatter=False, color='#FF7A48', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_nomaf', scatter=False, color='#0593A2', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_LD0.9', scatter=False, color='#103778', ci=None)

# y = x reference line
plt.plot([0, 0.7], [0, 0.7], linestyle='--', color='black')

# Optional text annotations with the Pearson correlations
#plt.text(0.05, 0.65, f'Pearson r (Froh_maf005) = {corr_maf005:.2f}', color='#E3371E')
#plt.text(0.05, 0.60, f'Pearson r (Froh_maf001) = {corr_maf001:.2f}', color='#FF7A48')
#plt.text(0.05, 0.55, f'Pearson r (Froh_nomaf) = {corr_nomaf:.2f}', color='#0593A2')
#plt.text(0.05, 0.50, f'Pearson r (Froh_LD0.9) = {corr_LD09:.2f}', color='#103778')

# Legend and labels
plt.legend(title='Method')
plt.xlabel('Fped')
plt.ylabel('FROH')
plt.title('Correlation between Fped and FROH with different methods')

# Hide the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save the figure as a PDF (before plt.show(), which closes the figure)
pdf_path = 'Correlation_Fped_FROH.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()

# Display the figure
plt.show()
From the plot above, the initial read is that filtering at MAF 0.05 yields the highest correlation.
3. Use more data to verify that the MAF 0.05 filter gives the strongest correlation
library(ggplot2)

# The data frame df holds one correlation coefficient per sample for each
# filtering scheme (columns MAF0.05, MAF0.01, no_MAF, LD); read it in:
df <- read.csv("data1.csv")

# Scatter plot of the correlation coefficients
ggplot(data = df, aes(x = 1:nrow(df))) +
  geom_point(aes(y = MAF0.05, color = "MAF > 0.05"), size = 3, alpha = 0.7) +
  geom_point(aes(y = MAF0.01, color = "MAF > 0.01"), size = 3, alpha = 0.7) +
  geom_point(aes(y = no_MAF, color = "no_MAF"), size = 3, alpha = 0.7) +
  geom_point(aes(y = LD, color = "LD"), size = 3, alpha = 0.7) +
  scale_color_manual(values = c("MAF > 0.05" = "#E3371E", "MAF > 0.01" = "#FF7A48",
                                "no_MAF" = "#0593A2", "LD" = "#103778")) +
  theme_minimal() +
  labs(title = "Correlation of FROH with Fped for Different MAF Thresholds",
       x = "Sample Index", y = "Correlation Coefficient",
       color = "MAF Threshold") +
  # Note: a second theme_minimal() call would reset the settings below,
  # so theme() is applied once, after the base theme
  theme(panel.grid = element_blank(),           # remove grid lines
        panel.background = element_blank(),     # remove background
        axis.line = element_line(size = 1),     # axis line width
        axis.ticks = element_line(size = 0.5),  # tick width
        plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 14),   # axis title size
        axis.text = element_text(size = 10))    # tick label size

# Save the figure
ggsave("enhanced_scatter_plot.pdf", width = 8, height = 8)
The figure shows that the MAF 0.05 filter yields FROH values with a higher correlation to the pedigree in most cases.
4. Compute relatedness coefficients under the different filtering criteria and verify their correlation with the pedigree
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages

# Load the Excel file
file_path = 'relation_RPED_Rplink.xlsx'
df = pd.read_excel(file_path)

# Melt from wide to long format for Seaborn plotting
df_melted = df.melt(id_vars=['relationship', 'Rped'],
                    value_vars=['R_maf0.05', 'R_maf0.01', 'R_nomaf', 'R_LD0.9'],
                    var_name='Method', value_name='Rgenome')

# Colour and marker for each method
palette = {
    'R_maf0.05': '#E3371E',
    'R_maf0.01': '#FF7A48',
    'R_nomaf': '#0593A2',
    'R_LD0.9': '#103778'
}
markers = {
    'R_maf0.05': 'X',
    'R_maf0.01': '^',
    'R_nomaf': 'D',
    'R_LD0.9': 'o'
}

# Pearson correlation for each method
corr_maf005, _ = pearsonr(df['Rped'], df['R_maf0.05'])
corr_maf001, _ = pearsonr(df['Rped'], df['R_maf0.01'])
corr_nomaf, _ = pearsonr(df['Rped'], df['R_nomaf'])
corr_LD09, _ = pearsonr(df['Rped'], df['R_LD0.9'])

# Scatter plot coloured by method
plt.figure(figsize=(6, 6))
sns.scatterplot(data=df_melted, x='Rped', y='Rgenome', hue='Method', style='Method',
                palette=palette, markers=markers, s=100)

# Regression line for each method, without confidence bands
sns.regplot(data=df, x='Rped', y='R_maf0.05', scatter=False, color='#E3371E', ci=None)
sns.regplot(data=df, x='Rped', y='R_maf0.01', scatter=False, color='#FF7A48', ci=None)
sns.regplot(data=df, x='Rped', y='R_nomaf', scatter=False, color='#0593A2', ci=None)
sns.regplot(data=df, x='Rped', y='R_LD0.9', scatter=False, color='#103778', ci=None)

# y = x reference line
plt.plot([0, 0.7], [0, 0.7], linestyle='--', color='black')

# Annotate the Pearson correlations
plt.text(0.05, 0.65, f'Pearson r (R_maf0.05) = {corr_maf005:.2f}', color='#E3371E')
plt.text(0.05, 0.60, f'Pearson r (R_maf0.01) = {corr_maf001:.2f}', color='#FF7A48')
plt.text(0.05, 0.55, f'Pearson r (R_nomaf) = {corr_nomaf:.2f}', color='#0593A2')
plt.text(0.05, 0.50, f'Pearson r (R_LD0.9) = {corr_LD09:.2f}', color='#103778')

# Legend and labels
plt.legend(title='Method')
plt.xlabel('Rped')
plt.ylabel('Rgenome')
plt.title('Correlation between Rped and Rgenome with different methods')

# Hide the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save the figure as a PDF
pdf_path = 'Correlation_Rped_Rgenome.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()

# Display the figure
plt.show()
5. Draw a parallel-coordinates plot (the value range of the correlation colour gradient can be adjusted)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize, LinearSegmentedColormap
from pandas.plotting import parallel_coordinates
# Load the data
data_path = 'wolf_kb.csv'  # adjust the path to your file location
data = pd.read_csv(data_path)
# Normalize the 'correlation' values for color mapping with specified range 0.5000 to 0.6500
norm = Normalize(vmin=0.5000, vmax=0.6500)
# Create a custom color map from #0593A2 to #FF7A48
colors = ["#0593A2", "#FF7A48"]
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
# Plot the parallel coordinates
fig, ax = plt.subplots(figsize=(8, 6))
parallel_coordinates(data.drop(columns='correlation_norm', errors='ignore'), class_column='correlation', ax=ax,
color=cmap(norm(data['correlation'].values)), alpha=0.5)
# Adjust the plot
plt.xticks(rotation=45, ha='right')
plt.title('Impact of Parameter Variation on Correlation with Colorbar')
# Create colorbar as a legend for the 'correlation'
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label('Correlation')
# Adjust layout
plt.tight_layout()
# Save the updated plot with colorbar as a PDF
colorbar_plt_path = 'colorbar_parallel_coordinates_plot.pdf'
plt.savefig(colorbar_plt_path, bbox_inches='tight')
plt.show()
Without adjusting the value range:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import cm
from matplotlib.colors import Normalize, LinearSegmentedColormap
from pandas.plotting import parallel_coordinates
# Load the data
data_path = 'snp.csv'  # adjust the path to your file location
data = pd.read_csv(data_path)
# Normalize the 'correlation' values for color mapping
norm = Normalize(vmin=data['correlation'].min(), vmax=data['correlation'].max())
# Create a custom color map from #0593A2 to #FF7A48
colors = ["#0593A2", "#FF7A48"]
cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
# Plot the parallel coordinates
fig, ax = plt.subplots(figsize=(8, 6))
parallel_coordinates(data.drop(columns='correlation_norm', errors='ignore'), class_column='correlation', ax=ax,
color=cmap(norm(data['correlation'].values)), alpha=0.5)
# Adjust the plot
plt.xticks(rotation=45, ha='right')
plt.title('Impact of Parameter Variation on Correlation with Colorbar')
# Create colorbar as a legend for the 'correlation'
sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label('Correlation')
# Adjust layout
plt.tight_layout()
# Save the updated plot with colorbar as a PDF
colorbar_plt_path = 'colorbar_parallel_coordinates_plot.pdf'
plt.savefig(colorbar_plt_path, bbox_inches='tight')
plt.show()
Code used to produce the ROH data in the table below
## To get the second column, first split the full ROH.hom into per-individual ROH files using the splitting script from an earlier post. Then the new script below checks whether each individual's file contains any ROH in the given length range; if it does, the file name is appended to output.TXT. Before running, put all the individual file names into in.put:
#!/bin/bash
# Lower and upper bounds of the length range
lower_bound=20000
upper_bound=100000
# Truncate or create output.TXT
> output.TXT
# Read each file name from in.put
while read -r file; do
    # Check that the file exists
    if [ -f "$file" ]; then
        echo "Processing file: $file"  # debug output
        # Use awk to test whether column 9 holds any value inside the range
        if awk -v lb="$lower_bound" -v ub="$upper_bound" 'NR > 1 && $9 >= lb && $9 <= ub {print "Found:", $9; found=1; exit} END {if (!found) exit 1}' "$file"; then
            # If it does, append the file name to output.TXT
            echo "$file" >> output.TXT
            echo "$file contains values in the range"  # debug output
        else
            echo "$file does not contain values in the range"  # debug output
        fi
    else
        echo "File $file does not exist"  # debug output
    fi
done < in.put
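A hedged usage sketch (the file pattern and script name are assumptions, not from the original run):
ls ind_*.txt > in.put    # list the per-individual ROH files
bash check_range.sh      # appends qualifying file names to output.TXT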
## To get the third column, count the ROH in the range:
cat ROH.hom | awk 'NR > 1 && $9 >= 500 && $9 <1000' | wc -l
## The fourth column is the third column divided by the total number of ROH.
## The fifth column is the third column divided by the second column (see the sketch below).
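A minimal sketch of those two ratios (variable names are illustrative; it assumes output.TXT was produced by the script above for this same length range):
in_range=$(awk 'NR > 1 && $9 >= 500 && $9 < 1000' ROH.hom | wc -l)  # column 3
total=$(awk 'NR > 1' ROH.hom | wc -l)                               # total ROH count
n_ind=$(wc -l < output.TXT)                                         # column 2: individuals with ROH in the range
awk -v a="$in_range" -v b="$total" 'BEGIN {print a / b}'            # column 4
awk -v a="$in_range" -v b="$n_ind" 'BEGIN {print a / b}'            # column 5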
## The sixth column is the mean length of the ROH in the range:
cat ROH.hom | awk 'NR > 1 && $9 >= 500 && $9 < 1000' | awk '{sum += $9} END {print sum / 10173}' ## 10173 is the count from the third column
Code used to produce the data in the table below
### This table gives the FROH corresponding to each ROH length range. The idea: select the ROH in the given range from the full ROH file, sum the ROH lengths of each individual, write each individual's ID and summed length to a file, and then divide by the total genome length.
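Before the scripts, the same aggregation can be sketched as a single awk pass (assuming, as in the scripts below, that column 2 is the individual ID, column 9 the ROH length, and 2258664.721 the genome-length denominator; individuals with no ROH in the range are simply absent here, whereas the per-individual scripts print 0 for them):
# One-pass alternative: sum column 9 per individual, divide by the genome length
awk '{sum[$2] += $9} END {for (id in sum) print id, sum[id] / 2258664.721}' 25000.txt > mean.txt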
### The first script is a Python script that generates the per-individual commands:
#!/usr/bin/env python
inputfile = "sample25000.id"  # sample IDs, one per line, without the .txt suffix
outputfile = "mean.sh"
# 25000.txt holds the ROH in the target length range, taken from the full ROH file
with open(inputfile) as ids, open(outputfile, "w") as output:
    for line in ids:
        sid = line.rstrip()
        output.write("cat 25000.txt | awk '$2==" + sid + "'> " + sid + ".txt\n")
        output.write("awk 'BEGIN{sum = 0} {sum += $9} END {print \"" + sid
                     + " \"sum/2258664.721}' " + sid + ".txt >> mean.txt\n")
### This generates a shell script with contents like the following:
cat 25000.txt | awk '$2==299'> 299.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "299 "sum/2258664.721}' 299.txt > mean.txt ### the generator emits >> here; change this first one to > by hand so mean.txt starts empty
cat 25000.txt | awk '$2==363'> 363.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "363 "sum/2258664.721}' 363.txt >> mean.txt
cat 25000.txt | awk '$2==495'> 495.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "495 "sum/2258664.721}' 495.txt >> mean.txt
cat 25000.txt | awk '$2==542'> 542.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "542 "sum/2258664.721}' 542.txt >> mean.txt
cat 25000.txt | awk '$2==550'> 550.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "550 "sum/2258664.721}' 550.txt >> mean.txt
cat 25000.txt | awk '$2==585'> 585.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "585 "sum/2258664.721}' 585.txt >> mean.txt
cat 25000.txt | awk '$2==594'> 594.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "594 "sum/2258664.721}' 594.txt >> mean.txt
cat 25000.txt | awk '$2==605'> 605.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "605 "sum/2258664.721}' 605.txt >> mean.txt
cat 25000.txt | awk '$2==670'> 670.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "670 "sum/2258664.721}' 670.txt >> mean.txt
cat 25000.txt | awk '$2==708'> 708.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "708 "sum/2258664.721}' 708.txt >> mean.txt
cat 25000.txt | awk '$2==720'> 720.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "720 "sum/2258664.721}' 720.txt >> mean.txt
cat 25000.txt | awk '$2==740'> 740.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "740 "sum/2258664.721}' 740.txt >> mean.txt
cat 25000.txt | awk '$2==771'> 771.txt
awk 'BEGIN{sum = 0} {sum += $9} END {print "771 "sum/2258664.721}' 771.txt >> mean.txt
# 1. Run the shell script
# 2. Compute the mean and standard deviation
awk '{print $2}' mean.txt | datamash mean 1 sstdev 1
# 3. Find the minimum and maximum (sort numerically, not lexically)
awk '{print $2}' mean.txt | sort -n | head -1
awk '{print $2}' mean.txt | sort -n | tail -1
6. Plot per-chromosome ROH counts (bars) and mean ROH length (line)
import pandas as pd
import matplotlib.pyplot as plt
# Load the data
file_path = 'count.xlsx'
df = pd.read_excel(file_path)
# Sort the dataframe by Chromosome to ensure order
df = df.sort_values(by='Chromosome')
# Plotting ROH counts with only specified chromosomes displayed
fig, ax1 = plt.subplots(figsize=(10, 6))
color = 'tab:blue'
ax1.set_xlabel('Chromosome')
ax1.set_ylabel('ROH Counts', color=color)
ax1.bar(df['Chromosome'], df['ROH count'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# Set x-axis ticks to match the chromosome numbers without gaps
ax1.set_xticks(df['Chromosome'])
ax1.set_xticklabels(df['Chromosome'])
# Create a second y-axis for the average ROH length
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Average ROH Length (Mbp)', color=color)
ax2.plot(df['Chromosome'], df['Average ROH length(Mbp)'], color=color, marker='o')
ax2.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.title('ROH Counts and Average ROH Length by Chromosome')
plot_path = 'ROH_Count_Length.pdf'
plt.savefig(plot_path, bbox_inches='tight')
plt.show()
7. Correlation between Froh computed from different ROH length classes and the pedigree
A small but handy script: the individual IDs in the generated Froh file are not in the same order as in my original ID list. For example, the order I want is:
1
2
3
4
5
6
but the order in the output is:
2
3
5
1
4
6
The script below restores the order to 1, 2, 3, 4, 5, 6:
import pandas as pd
# Paths to the two input files
order_file_path = '1.txt.txt'
data_file_path = '10000up.txt'
# Load the desired order of IDs
with open(order_file_path, 'r') as file:
    order_list = file.read().splitlines()
# Load the Froh data from the 10000up.txt file
data_df = pd.read_csv(data_file_path, sep=' ', header=None, names=['ID', 'Value'])
# Ensure both 'ID' columns are of the same type (string)
order_df = pd.DataFrame(order_list, columns=['ID'])
order_df['ID'] = order_df['ID'].astype(str)
data_df['ID'] = data_df['ID'].astype(str)
# Merge the data to follow the order in the order list
merged_df = order_df.merge(data_df, on='ID', how='left')
# Save the resulting DataFrame to a new file
merged_file_path = 'merged_10000up.txt'
merged_df.to_csv(merged_file_path, sep=' ', index=False, header=False)
print(f'Merged file saved to: {merged_file_path}')
Plot the correlation figure:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib.backends.backend_pdf import PdfPages

# Load the Excel file
file_path = 'different_roh.xlsx'
df = pd.read_excel(file_path)

# Melt from wide to long format for Seaborn plotting
df_melted = df.melt(id_vars=['ID', 'Fped'],
                    value_vars=['Froh', 'Froh_1000', 'Froh_3000', 'Froh_5000', 'Froh_10000', 'Froh_10000up'],
                    var_name='Method', value_name='FROH')

# Colour and marker for each length class
palette = {
    'Froh': '#E3371E',
    'Froh_1000': '#FF7A48',
    'Froh_3000': '#0593A2',
    'Froh_5000': '#103778',
    'Froh_10000': '#1EE3B0',
    'Froh_10000up': '#48D6FF'
}
markers = {
    'Froh': 'X',
    'Froh_1000': '^',
    'Froh_3000': 'D',
    'Froh_5000': 'o',
    'Froh_10000': '*',
    'Froh_10000up': 'v'
}

# Pearson correlation for each length class
corr_Froh, _ = pearsonr(df['Fped'], df['Froh'])
corr_Froh_1000, _ = pearsonr(df['Fped'], df['Froh_1000'])
corr_Froh_3000, _ = pearsonr(df['Fped'], df['Froh_3000'])
corr_Froh_5000, _ = pearsonr(df['Fped'], df['Froh_5000'])
corr_Froh_10000, _ = pearsonr(df['Fped'], df['Froh_10000'])
corr_Froh_10000up, _ = pearsonr(df['Fped'], df['Froh_10000up'])

# Scatter plot coloured by length class
plt.figure(figsize=(6, 6))
sns.scatterplot(data=df_melted, x='Fped', y='FROH', hue='Method', style='Method',
                palette=palette, markers=markers, s=100)

# Regression line for each length class, without confidence bands
sns.regplot(data=df, x='Fped', y='Froh', scatter=False, color='#E3371E', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_1000', scatter=False, color='#FF7A48', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_3000', scatter=False, color='#0593A2', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_5000', scatter=False, color='#103778', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_10000', scatter=False, color='#1EE3B0', ci=None)
sns.regplot(data=df, x='Fped', y='Froh_10000up', scatter=False, color='#48D6FF', ci=None)

# y = x reference line
plt.plot([0, 0.7], [0, 0.7], linestyle='--', color='black')

# Optional text annotations with the Pearson correlations
#plt.text(0.05, 0.65, f'Pearson r (Froh) = {corr_Froh:.2f}', color='black')
#plt.text(0.05, 0.60, f'Pearson r (Froh_1000) = {corr_Froh_1000:.2f}', color='black')
#plt.text(0.05, 0.55, f'Pearson r (Froh_3000) = {corr_Froh_3000:.2f}', color='black')
#plt.text(0.05, 0.50, f'Pearson r (Froh_5000) = {corr_Froh_5000:.2f}', color='black')
#plt.text(0.05, 0.45, f'Pearson r (Froh_10000) = {corr_Froh_10000:.2f}', color='black')
#plt.text(0.05, 0.40, f'Pearson r (Froh_10000up) = {corr_Froh_10000up:.2f}', color='black')

# Legend and labels
plt.legend(title='Method')
plt.xlabel('Fped')
plt.ylabel('FROH')
#plt.title('Correlation between Fped and Fgenome with different methods')

# Hide the top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Save the figure as a PDF
pdf_path = 'different_ROH.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()

# Display the figure
plt.show()
8. Compute the genetic-diversity statistics
1. Nucleotide diversity (π)
## Compute the π values
vcftools --vcf /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome.vcf --window-pi 500000 --window-pi-step 100000 --out SCT_pi
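## The .windowed.pi output columns are CHROM, BIN_START, BIN_END, N_VARIANTS, PI; column 5 (PI) is used below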
## Extract the PI column and drop the header row
awk '{print $5}' SCT_pi.windowed.pi | sed '1d' > A.txt
## Sort in descending order
sort -n -r A.txt > B.txt
## Compute the mean and standard deviation
awk '{print $1}' A.txt | datamash mean 1 sstdev 1 > SCT_mean_pi.txt
## Keep columns 1 and 5 (dropping the header row)
cat SCT_pi.windowed.pi | awk 'NR > 1' | awk '{print $1,$5}' > sct_pi.txt
Data transformation
## Reshape the data into 4 columns as required for the Manhattan plot
import pandas as pd
import numpy as np

# Read the data file
file_path = 'sct_pi.txt'  # keep sct_pi.txt next to the script, or give a full path
data = pd.read_csv(file_path, sep=" ", header=None, names=['chromosome', 'p_value'])

# Generate a per-chromosome window index as the position
data['position'] = data.groupby('chromosome').cumcount()

# Compute -log10 of the value (the windowed pi, stored in the p_value column)
data['-log10(p_value)'] = -np.log10(data['p_value'])

# Reorder the columns
data = data[['chromosome', 'position', 'p_value', '-log10(p_value)']]

# Save the result
output_file_path = 'SCT_Pi_manhadun.txt'
data.to_csv(output_file_path, sep="\t", index=False)
print(f"Done; results saved to {output_file_path}")
Draw the Manhattan plot
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Read the transformed data
file_path = 'SCT_Pi_manhadun.txt'  # keep the file next to the script, or give a full path
data = pd.read_csv(file_path, sep="\t")

# Chromosome labels
chromosomes = sorted(data['chromosome'].unique())

# Cumulative position of each window along the genome
data['cumulative_pos'] = data.groupby('chromosome').cumcount()
offset = 0
chromosome_offsets = {}
for chrom in chromosomes:
    chromosome_offsets[chrom] = offset
    offset += data[data['chromosome'] == chrom]['cumulative_pos'].max() + 1
data['adjusted_pos'] = data.apply(lambda row: row['cumulative_pos'] + chromosome_offsets[row['chromosome']], axis=1)

# Draw the Manhattan plot
fig, ax = plt.subplots(figsize=(8, 4))

# A different colour per chromosome
colors = plt.cm.tab20.colors

# Plot each chromosome's windows
for i, chrom in enumerate(chromosomes):
    chrom_data = data[data['chromosome'] == chrom]
    ax.scatter(chrom_data['adjusted_pos'], chrom_data['-log10(p_value)'],
               color=colors[i % len(colors)], s=2,
               label=f'Chromosome {chrom}' if i == 0 else "")

# Reference lines
ax.axhline(y=3.75, color='r', linestyle='--')
ax.axhline(y=5, color='g', linestyle='--')

# Labels and title
ax.set_xlabel('Chromosome')
ax.set_ylabel('-log10(p_value)')
ax.set_title('Manhattan Plot')

# x-axis ticks at the midpoint of each chromosome
chromosome_labels = []
tick_positions = []
for chrom in chromosomes:
    midpoint = (chromosome_offsets[chrom] + chromosome_offsets[chrom] + data[data['chromosome'] == chrom]['cumulative_pos'].max()) // 2
    tick_positions.append(midpoint)
    chromosome_labels.append(chrom)
ax.set_xticks(tick_positions)
ax.set_xticklabels(chromosome_labels)

# Save the figure as a PDF
pdf_path = 'Pi_manhadun.pdf'
with PdfPages(pdf_path) as pdf:
    pdf.savefig()

# Show the legend and figure if desired
#ax.legend()
#plt.show()
2. PIC (polymorphic information content)
Polymorphic information content (PIC) measures a marker's (e.g. an SNP's) polymorphism and the amount of information it provides in a population; it reflects the marker's ability to distinguish genotypes. The higher the PIC, the more polymorphic and informative the marker.
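As a quick numeric check of the biallelic PIC formula used in the R script below (illustrative allele frequency only):
# For p = 0.3, q = 0.7: PIC = 1 - (0.09 + 0.49) - 2*0.09*0.49 = 0.3318
awk 'BEGIN {p = 0.3; q = 1 - p; print 1 - (p^2 + q^2) - 2 * p^2 * q^2}'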
### Compute MAF
plink --file /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome --freq --out sct_allele_freq
Compute PIC with an R script:
# Read the allele-frequency file
freq_data <- read.table("sct_allele_freq.frq", header = TRUE)
# PIC for a biallelic marker: 1 - (p^2 + q^2) - 2 p^2 q^2
calculate_pic <- function(p) {
  q <- 1 - p
  pic <- 1 - (p^2 + q^2) - 2 * p^2 * q^2
  return(pic)
}
# Apply to the minor allele frequencies
freq_data$MAF <- as.numeric(freq_data$MAF)
freq_data$PIC <- sapply(freq_data$MAF, calculate_pic)
# Save the result
write.table(freq_data, file = "allele_freq_with_PIC.txt", row.names = FALSE, quote = FALSE)
awk '{print $1,$7}' allele_freq_with_PIC.txt | sed "1d" > PIC.txt
awk '{print $2}' PIC.txt | datamash mean 1 sstdev 1 > PIC_mean.txt
3. Compute Ho (observed heterozygosity) and He (expected heterozygosity)
plink --file /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome --hardy --out SCT
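## In PLINK's .hwe output, column 7 is O(HET) (observed heterozygosity) and column 8 is E(HET) (expected heterozygosity)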
awk '{print $7 "\t" $8}' SCT.hwe | sed '1d' > SCT_het.txt
awk '{print $1}' SCT_het.txt | datamash mean 1 sstdev 1 > SCT_Ho.txt
awk '{print $2}' SCT_het.txt | datamash mean 1 sstdev 1 > SCT_He.txt
4. Nei's diversity index
Nei's diversity index (also called Nei's gene diversity) is a key measure of the genetic diversity of a population. Proposed by Nei (1973), it estimates the degree of genetic variation within a population and is widely used in molecular evolution and population genetics. For a biallelic site, H = 1 − (p² + q²) = 2pq; e.g. for MAF p = 0.3, H = 1 − (0.09 + 0.49) = 0.42.
# Read the allele-frequency file
freq_data <- read.table("../PIC/sct_allele_freq.frq", header = TRUE)
# Inspect the structure of the frequency file
str(freq_data)
# Nei diversity for a biallelic site: H = 1 - (p^2 + q^2)
calculate_nei <- function(p) {
  q <- 1 - p
  H <- 1 - (p^2 + q^2)
  return(H)
}
# Apply to the minor allele frequencies
freq_data$MAF <- as.numeric(freq_data$MAF)
freq_data$Nei_H <- sapply(freq_data$MAF, calculate_nei)
# Save the result
write.table(freq_data, file = "allele_freq_with_Nei_H.txt", row.names = FALSE, quote = FALSE)
# Show the result
head(freq_data)
5. Shannon-Wiener index
The Shannon-Wiener index can indeed be used to reflect genetic diversity, although in ecology it is more commonly used to measure species diversity. Genetic diversity is the degree of genetic variation in a population, and the Shannon-Wiener index measures it by accounting for both the richness and the evenness of allele frequencies: for a biallelic site, H = −(p ln p + q ln q), e.g. about 0.61 for p = 0.3.
# Read the allele-frequency file
freq_data <- read.table("../PIC/sct_allele_freq.frq", header = TRUE)
# Inspect the structure of the frequency file
str(freq_data)
# Shannon-Wiener index for a biallelic site: H = -(p ln p + q ln q)
calculate_shannon <- function(p) {
  q <- 1 - p
  H <- - (p * log(p) + q * log(q))
  return(H)
}
# Drop NA values (monomorphic sites with MAF = 0 would also give log(0) = -Inf)
freq_data <- freq_data[!is.na(freq_data$MAF) & freq_data$MAF > 0, ]
# Apply to the minor allele frequencies
freq_data$Shannon_H <- sapply(freq_data$MAF, calculate_shannon)
# Save the result
write.table(freq_data, file = "allele_freq_with_Shannon_H.txt", row.names = FALSE, quote = FALSE)
6. HOM/HET
The number of homozygous sites in the genome divided by the number of heterozygous sites.
First compute the per-individual homozygosity counts with vcftools:
vcftools --vcf /home/DL_2/CUTech/comput_2/Tech_No1/group4/yjc/workspace/SCT_data/var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome.vcf --het --out SCT.het
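## The .het output columns are INDV, O(HOM), E(HOM), N_SITES, F; the script below uses O(HOM) and N_SITES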
Once SCT.het.het has been produced, compute the ratio with a Python script:
# Open the input and output files
with open('SCT.het.het', 'r') as infile, open('HOM_HET.txt', 'w') as outfile:
    # Read the first line (the header)
    header = infile.readline().strip().split()
    # Write the output header
    outfile.write('ID\tO(HOM)\tO(HET)\tHOM/HET\n')
    # Process each data line
    for line in infile:
        data = line.strip().split()
        # Get the ID, O(HOM) and N_SITES fields
        ID = data[0]
        O_HOM = int(data[1])
        N_SITES = int(data[3])
        # Derive O(HET) and the HOM/HET ratio
        O_HET = N_SITES - O_HOM
        HOM_HET_ratio = O_HOM / O_HET if O_HET != 0 else 'Inf'
        # Write the output line
        outfile.write(f'{ID}\t{O_HOM}\t{O_HET}\t{HOM_HET_ratio}\n')
7. Tajima's D
vcftools --vcf ../../../var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome.vcf --TajimaD 500000 --out SCT_TajimaD
8. IBD computation and the mean IBD proportion
Compute IBD:
plink --bfile ../../../var/01.combine_gvcf/04.deep_allele_maf/01.plink/SCT1_SNP_ind_maf0.05_autosome --genome --out ibd
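# PI_HAT in the .genome output is PLINK's estimate of the proportion of the genome shared IBD; it is averaged below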
Compute the mean proportion with an R script:
# Read the PLINK genome file
ibd_data <- read.table("ibd.genome", header = TRUE)
# Mean of the IBD proportions across all pairs
mean_ibd <- mean(ibd_data$PI_HAT)
# Assemble the result as a character vector
result <- c("Mean IBD proportion:", mean_ibd)
# Write the result to a file
writeLines(result, con = "result.txt")
# Save the pairwise IBD results to a file
write.table(ibd_data, file = "ibd_result.txt", row.names = FALSE, quote = FALSE, sep = "\t")