双序列全局比对主要是依据Needleman-Wnnsch算法来进行
整个过程分为三步
1.设置一个矩阵:第一条序列长m,沿x轴排列,第二条序列长n,沿y轴排列
2.设置打分矩阵,根据适当的打分公式来对对应的碱基进行打分,有四种情况:1.两碱基完全匹配2.不匹配3.第一条序列引入空位4.第二条序列引入空位
3.反向寻找最佳比对,通过回溯法,从打分矩阵的右下方最大得分处,往左上方寻找最大值。所谓动态规划表示最佳路径是通过逐步延长最佳子路径得到的,即比对的每一步都选择获得分值高的碱基对作为比对的方法
总目标是沿着矩阵对角线找到最佳分值的路径,这条路径确定了最佳比对
Python代码
import numpy
import string
## Score matrix for nucleotide alignment
NUC44=numpy.array([[5,-4,-4,-4,-2],\
[-4,5,-4,-4,-2],\
[-4,-4,5,-4,-2],\
[-4,-4,-4,5,-2],\
[-2,-2,-2,-2,-1]])
NBET='ATGCN'
## define the function for calculating score matrix and arrow matrix:
def scoreMat(NUC44,NBET,seq1,seq2,gap=-8):
len1,len2=len(seq1),len(seq2)
scorMat=numpy.zeros((len1+1,len2+1),int)
arrowMat=numpy.zeros((len1+1,len2+1),int)
scorMat[0,:]=numpy.arange(len2+1)*gap
scorMat[:,0]=numpy.arange(len1+1)*gap
arrowMat[0,:]=numpy.ones(len2+1)
arrowMat[1:,0]=numpy.zeros(len1)
for i in range(1,len1+1):
for j in range(1,len2+1):
s_mat=numpy.zeros(3)
s_mat[0]=scorMat[i-1,j]+gap
s_mat[1]=scorMat[i,j-1]+gap
n1,n2=NBET.index(seq1[i-1]),NBET.index(seq2[j-1])
s_mat[2]=scorMat[i-1,j-1]+NUC44[n1,n2]
scorMat[i,j]=numpy.max(s_mat)
arrowMat[i,j]=numpy.argmax(s_mat)
return scorMat,arrowMat
## obtain the alignment of the sequences
def DynAlign(scorMat,arrow,seq1,seq2):
aln_seq1,aln_seq2='',''
flat_scorMat=numpy.ravel(scorMat)
v,h=divmod(numpy.argmax(flat_scorMat),len(seq2)+1)
print(v,h)
while True:
if arrow[v,h]==0:
aln_seq1+=seq1[v-1]
aln_seq2+='-'
v-=1
elif arrow[v,h]==1:
aln_seq1+='-'
aln_seq2+=seq2[h-1]
h-=1
elif arrow[v,h]==2:
aln_seq1+=seq1[v-1]
aln_seq2+=seq2[h-1]
v-=1
h-=1
if v==0 and h==0:
break
aln1=aln_seq1[::-1]
aln2=aln_seq2[::-1]
return aln1,aln2
sq1=input("Please input the sequence 1:")
sq2=input("Please input the sequence 2:")
scoreMatrix,arrowMatrix=scoreMat(NUC44,NBET,sq1,sq2,gap=-8)
aln1,aln2=DynAlign(scoreMatrix,arrowMatrix,sq1,sq2)
print('The score matrix is:')
print(scoreMatrix)
print('The arrow matrix is:')
print(arrowMatrix)
print('The aligned sequences are:')
print(aln1)
print(aln2)
C语言代码
#include <stdio.h>
#include <string.h>
char seq1_out[100] = "\0", seq2_out[100] = "\0";//定义比对后的输出一维数组
int scores[50][50] = { 0 };//定义得分矩阵
int score_array[5][5] = { {5,-4,-4,-4,-2},{-4,5,-4,-4,-2},{-4,-4,5,-4,-2},{-4,-4,-4,5,-2},{-2,-2,-2,-2,-1} };//定义打分矩阵
int main()
{
void output(char seq1[], char seq2[]);//引用输出函数
void scoresMat(char seq1[], char seq2[], char normol[], int gap = -8);//引用打分函数
char seq1[100] = "\0", seq2[100] = "\0", normol[] = "ATCGN";
int i, j;
//输入待比对序列
printf("Please input the sequence 1 :");
scanf("%s", seq1);
printf("Please input the sequence 2 :");
scanf("%s", seq2);
scoresMat(seq1, seq2, normol);
printf("the score array is:\n");
for (i = 0; i <= strlen(seq2); i++)
for (j = 0; j <= strlen(seq1); j++)
if (j == strlen(seq1))
printf("%d\n", scores[i][j]);
else
printf("%d\t", scores[i][j]);
output(seq1, seq2);
int out_len1 = strlen(seq1_out), out_len2 = strlen(seq2_out);
for (i = out_len1 - 1; i >= 0; i--)
printf("%c", seq1_out[i]);
printf("\n");
for (i = out_len2 - 1; i >= 0; i--)
printf("%c", seq2_out[i]);
printf("\n");
return 0;
}
void scoresMat(char seq1[], char seq2[], char normol[], int gap = -8)//打分函数
{
int find_index(char c, char normol[]);
int max(int score1, int score2, int score3);
int index1, index2;
int len1 = strlen(seq1), len2 = strlen(seq2);
int i = 0, j = 0, k = 0, temp = 0;
for (i = 1, temp = -8; i <= len1; i++, temp += gap)//空位罚分为-8
scores[0][i] = temp;
for (i = 1, temp = -8; i <= len2; i++, temp += gap)//空位罚分为-8
scores[i][0] = temp;
for (i = 1; i <= len2; i++)
for (j = 1; j <= len1; j++)
{
index1 = 0, index2 = 0;
int score1 = 0, score2 = 0, score3 = 0, final_score = 0;
index1 = find_index(seq2[i - 1], normol);//当i=1时,其实是第二条序列的第一个碱基,在序列中的索引为0,所以要减一
index2 = find_index(seq1[j - 1], normol);
score1 = scores[i - 1][j - 1] + score_array[index1][index2];
score2 = scores[i - 1][j] + gap;
score3 = scores[i][j - 1] + gap;
final_score = max(score1, score2, score3);
scores[i][j] = final_score;
}
}
int max(int score1, int score2, int score3)//求三个数最大值函数
{
if (score1 > score2)
if (score1 > score3)
return score1;
else
return score3;
else
if (score2 > score3)
return score2;
else
return score3;
}
int find_index(char c, char normol[])//寻找碱基在normol中的索引函数
{
int i = 0, index = 0;
for (i = 0; i < strlen(normol); i++)
if (c == normol[i])
index = i;
return index;
}
void output(char seq1[], char seq2[])//反向输出最佳匹配序列函数
{
int out_num, seq1_num = strlen(seq1) - 1, seq2_num = strlen(seq2) - 1;
int i, j;
int score_sum = 0;
int max(int score1, int score2, int score3);
int len1 = strlen(seq1), len2 = strlen(seq2);
//找出最后一行数值最大的数的位置
int max_score = scores[len2][0], max_j = 0;
for (j = 1; j <= len1; j++)
if (max_score <= scores[len2][j])
{
max_score = scores[len2][j];
max_j = j;
}
int col_max = scores[len2][max_j];
//找出最后一列数值最大的数的位置
int max_score1 = scores[0][len1], max_i = 0;
for (i = 1; i <= len2; i++)
if (max_score1 < scores[i][len1])
{
max_score1 = scores[i][len1];
max_i = i;
}
int row_max = scores[max_i][len1];
//找出最后一列和最后一行最大的数,作为反向找出最佳匹配的起始点
int start_i, start_j;
if (col_max >= row_max)
{
start_i = len2;
start_j = max_j;
}
if (col_max < row_max)
{
start_i = max_i;
start_j = len1;
}
//开始反向寻找最佳匹配
if (start_i < len2)
{
for (i = start_i + 1; i <= len2; i++)
score_sum += scores[i][len1];
for (out_num = 0; out_num < len2 - start_i; out_num++)
{
seq1_out[out_num] = '-';
seq2_out[out_num] = seq2[seq2_num];
seq2_num--;
}
}
if (start_j < len1)
{
for (j = start_j + 1; j<= len1; j++)
score_sum += scores[len2][j];
for (out_num = 0; out_num < len1 - start_j; out_num++)
{
seq2_out[out_num] = '-';
seq1_out[out_num] = seq1[seq1_num];
seq1_num--;
}
}
if (start_i == len2 && start_j == len1)
{
out_num = 0;
seq2_out[out_num] = seq2[seq2_num];
seq1_out[out_num] = seq1[seq1_num];
out_num++; seq1_num--; seq2_num--;
}
score_sum += scores[start_i][start_j];
while (start_i > 1 && start_j > 1 && seq1_num > 0 && seq2_num > 0)
{
int score_max = 0, score1 = 0, score2 = 0, score3 = 0;//score1为(i,j)左边的分数,score2为(i,j)上边的分数,score3为(i,j)对角线上方的分数
score1 = scores[start_i][start_j - 1];
score2 = scores[start_i - 1][start_j];
score3 = scores[start_i - 1][start_j - 1];
score_max = max(score1, score2, score3);
score_sum += score_max;
if (score3 == score_max)
{
start_i = start_i - 1;
start_j = start_j - 1;
seq1_out[out_num] = seq1[seq1_num];
seq2_out[out_num] = seq2[seq2_num];
out_num++; seq1_num--; seq2_num--;
}
if (score1 == score_max && score2 != score_max)
{
start_j = start_j - 1;
seq1_out[out_num] = seq1[seq1_num];
seq2_out[out_num] = '-';
out_num++; seq1_num--;
}
if (score2 == score_max && score1 != score_max)
{
start_i = start_i - 1;
seq2_out[out_num] = seq2[seq2_num];
seq1_out[out_num] = '-';
out_num++; seq2_num--;
}
if (score2 == score_max && score1 == score_max)
{
start_i = start_i - 1;
seq2_out[out_num] = seq2[seq2_num];
seq1_out[out_num] = '-';
out_num++; seq2_num--;
}
}
if (start_i == 1 && start_j != 1)
while (seq1_num >= 0)
{
seq1_out[out_num] = seq1[seq1_num];
seq2_out[out_num] = '-';
seq1_num--; out_num++;
}
if (start_j == 1 && start_i != 1)
while (seq2_num >= 0)
{
seq2_out[out_num] = seq2[seq2_num];
seq1_out[out_num] = '-';
seq2_num--; out_num++;
}
if (start_j == 1 && start_i == 1)
{
seq2_out[out_num] = seq2[0];
seq1_out[out_num] = seq1[0];
}
printf("The best scores is %d\n", score_sum);
}
结果如下图:
代码仅供参考和学习,如果有问题,请联系我,我们可以一起讨论!!!