usage:
python draw_step4.structure.py -n UPGMA.nwk --fam ./SNP_maf0.05_miss0.5.fam -Q ./SNP_maf_miss_ld -K 6 -o test.pdf
脚本
首先导入需要用到的包。--order和-n都是用于指定顺序的,有一个参数就可以了。其他都是必选参数。
import argparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.patches import Patch
from Bio import Phylo
sns.set_style("darkgrid")
# 创建ArgumentParser对象
parser = argparse.ArgumentParser(description="draw a population structure plot")
# 添加外部参数
parser.add_argument('-n', "--newick", type=str, help='input a newick file')
parser.add_argument('--order', type=str, help='input an ordered sample name list')
parser.add_argument("-f", "--fam", type=str, help="input the order of bed")
parser.add_argument('-Q', type=str, help="the structure.Q path")
parser.add_argument('-K', type=int, help="the number of Q")
parser.add_argument('-o', "--output", type=str, help='output the population structure plot pdf')
args = parser.parse_args()
核心的排序函数和绘图函数
def read_nwk(nwk: str,root:str=None) -> list:
tree = Phylo.read(nwk, "newick")
if root:
new_root_clade = tree.common_ancestor(root)
tree.root_with_outgroup(new_root_clade)
nwk_list = [clade.name for clade in tree.get_terminals()][::-1]
return nwk_list
def read_structure_Q(prefix_path: str, order_list: list, i: int, fam_list: list) -> pd.DataFrame:
df = pd.read_csv(f"{prefix_path}.{str(i)}.Q", sep=" ", header=None)
df.index = fam_list
df = df.T[order_list]
return df
def draw_structure(df: pd.DataFrame, n: int, cmap: sns.cubehelix_palette, axes) -> None:
for i in range(len(df)):
sns.barplot(x=df.columns, y=df.iloc[i], color=cmap[i], bottom=np.sum(df.iloc[:i], axis=0), ax=axes[n],
dodge=False,
width=1, edgecolor='w')
读入数据,调颜色
# 读入数据
fam = args.fam
K = args.K
prefix_path = args.Q
nwk = args.newick
fam_list = [x.split(" ")[0] for x in open(fam, "rt").readlines()]
if args.order:
print(args.order)
order = args.order
order_list = [x.split("\t")[0] for x in open(order, "rt").readlines()]
else:
order_list = read_nwk(nwk)
cmap = sns.color_palette("hls", n_colors=K)
画图
中间上色的部分和自己的order.list有关,颜色也是自己去调(某个的颜色对应某个亚群)
# 给所有文字放大到两倍
sns.set(context='notebook', style='ticks', font_scale=2)
fig, axes = plt.subplots(nrows=K - 1, ncols=1, figsize=(70, K * 2))
for i in range(2, K + 1):
# 读 Q 矩阵并调顺序
df = read_structure_Q(prefix_path, order_list, i, fam_list)
n = i - 2
if n == K - 2:
# cmap[3], cmap[1], cmap[0], cmap[2] = cmap[0], cmap[3], cmap[2], cmap[1]
# 每次循环画一张图出来
draw_structure(df, n, cmap, axes)
# 最后一行保留样本名
axes[n].set(xlabel="", ylabel=f"K={i}", yticks=[])
# 旋转 x 轴标签
axes[n].set_xticklabels(axes[n].get_xticklabels(), rotation=45, ha='right')
# 上色
# for i, label in enumerate(axes[n].xaxis.get_ticklabels()):
# if 1 < i < 30:
# color = cmap[3]
# elif 29 < i < 51:
# color = cmap[1]
# elif 50 < i < 66:
# color = cmap[0]
# elif 65 < i < 77:
# color = cmap[2]
# elif 76 < i < 90:
# color = cmap[4]
# else:
# color = "black"
# label.set_color(color)
# 处理标签和颜色
# legend_labels = {'Tropical': cmap[3], 'Lancaster': cmap[1], 'P_group': cmap[0], 'Reid': cmap[2], 'IDT': cmap[4],
# "Mixed": "black"} # 用于存储图例标签
# legend_patches = [] # 用于存储图例的 Patch 对象
# for label, color in legend_labels.items():
# legend_patches.append(Patch(color=color, label=label))
# axes[K - 2].legend(handles=legend_patches, loc='upper center', bbox_to_anchor=(0.5, -0.8), ncol=K + 1)
else:
draw_structure(df, n, cmap, axes)
axes[n].set(xlabel="", xticks=[], ylabel=f"K={i}", yticks=[])
plt.margins(x=0, y=0)
plt.tight_layout(h_pad=0)
plt.subplots_adjust(hspace=0, wspace=0)
plt.savefig(args.output, format="pdf", dpi=300)
plt.show()