In this notebook I implement the basic feature of Needleman-Wunsch sequence alignment. You will build from here to complete homework #3.
from __future__ import division
from random import choice
Define a substitution matrix and a scorer
function to look values up in that substitution matrix.
blosum50 = {'A': {'A': 5.0, 'C': -1.0, 'D': -2.0, 'E': -1.0, 'F': -3.0, 'G': 0.0, 'H': -2.0, 'I': -1.0, 'K': -1.0, 'L': -2.0, 'M': -1.0, 'N': -1.0, 'P': -1.0, 'Q': -1.0, 'R': -2.0, 'S': 1.0, 'T': 0.0, 'V': 0.0, 'W': -3.0, 'Y': -2.0},
'C': {'A': -1.0, 'C': 13.0, 'D': -4.0, 'E': -3.0, 'F': -2.0, 'G': -3.0, 'H': -3.0, 'I': -2.0, 'K': -3.0, 'L': -2.0, 'M': -2.0, 'N': -2.0, 'P': -4.0, 'Q': -3.0, 'R': -4.0, 'S': -1.0, 'T': -1.0, 'V': -1.0, 'W': -5.0, 'Y': -3.0},
'D': {'A': -2.0, 'C': -4.0, 'D': 8.0, 'E': 2.0, 'F': -5.0, 'G': -1.0, 'H': -1.0, 'I': -4.0, 'K': -1.0, 'L': -4.0, 'M': -4.0, 'N': 2.0, 'P': -1.0, 'Q': 0.0, 'R': -2.0, 'S': 0.0, 'T': -1.0, 'V': -4.0, 'W': -5.0, 'Y': -3.0},
'E': {'A': -1.0, 'C': -3.0, 'D': 2.0, 'E': 6.0, 'F': -3.0, 'G': -3.0, 'H': 0.0, 'I': -4.0, 'K': 1.0, 'L': -3.0, 'M': -2.0, 'N': 0.0, 'P': -1.0, 'Q': 2.0, 'R': 0.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -2.0},
'F': {'A': -3.0, 'C': -2.0, 'D': -5.0, 'E': -3.0, 'F': 8.0, 'G': -4.0, 'H': -1.0, 'I': 0.0, 'K': -4.0, 'L': 1.0, 'M': 0.0, 'N': -4.0, 'P': -4.0, 'Q': -4.0, 'R': -3.0, 'S': -3.0, 'T': -2.0, 'V': -1.0, 'W': 1.0, 'Y': 4.0},
'G': {'A': 0.0, 'C': -3.0, 'D': -1.0, 'E': -3.0, 'F': -4.0, 'G': 8.0, 'H': -2.0, 'I': -4.0, 'K': -2.0, 'L': -4.0, 'M': -3.0, 'N': 0.0, 'P': -2.0, 'Q': -2.0, 'R': -3.0, 'S': 0.0, 'T': -2.0, 'V': -4.0, 'W': -3.0, 'Y': -3.0},
'H': {'A': -2.0, 'C': -3.0, 'D': -1.0, 'E': 0.0, 'F': -1.0, 'G': -2.0, 'H': 10.0, 'I': -4.0, 'K': 0.0, 'L': -3.0, 'M': -1.0, 'N': 1.0, 'P': -2.0, 'Q': 1.0, 'R': 0.0, 'S': -1.0, 'T': -2.0, 'V': -4.0, 'W': -3.0, 'Y': 2.0},
'I': {'A': -1.0, 'C': -2.0, 'D': -4.0, 'E': -4.0, 'F': 0.0, 'G': -4.0, 'H': -4.0, 'I': 5.0, 'K': -3.0, 'L': 2.0, 'M': 2.0, 'N': -3.0, 'P': -3.0, 'Q': -3.0, 'R': -4.0, 'S': -3.0, 'T': -1.0, 'V': 4.0, 'W': -3.0, 'Y': -1.0},
'K': {'A': -1.0, 'C': -3.0, 'D': -1.0, 'E': 1.0, 'F': -4.0, 'G': -2.0, 'H': 0.0, 'I': -3.0, 'K': 6.0, 'L': -3.0, 'M': -2.0, 'N': 0.0, 'P': -1.0, 'Q': 2.0, 'R': 3.0, 'S': 0.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -2.0},
'L': {'A': -2.0, 'C': -2.0, 'D': -4.0, 'E': -3.0, 'F': 1.0, 'G': -4.0, 'H': -3.0, 'I': 2.0, 'K': -3.0, 'L': 5.0, 'M': 3.0, 'N': -4.0, 'P': -4.0, 'Q': -2.0, 'R': -3.0, 'S': -3.0, 'T': -1.0, 'V': 1.0, 'W': -2.0, 'Y': -1.0},
'M': {'A': -1.0, 'C': -2.0, 'D': -4.0, 'E': -2.0, 'F': 0.0, 'G': -3.0, 'H': -1.0, 'I': 2.0, 'K': -2.0, 'L': 3.0, 'M': 7.0, 'N': -2.0, 'P': -3.0, 'Q': 0.0, 'R': -2.0, 'S': -2.0, 'T': -1.0, 'V': 1.0, 'W': -1.0, 'Y': 0.0},
'N': {'A': -1.0, 'C': -2.0, 'D': 2.0, 'E': 0.0, 'F': -4.0, 'G': 0.0, 'H': 1.0, 'I': -3.0, 'K': 0.0, 'L': -4.0, 'M': -2.0, 'N': 7.0, 'P': -2.0, 'Q': 0.0, 'R': -1.0, 'S': 1.0, 'T': 0.0, 'V': -3.0, 'W': -4.0, 'Y': -2.0},
'P': {'A': -1.0, 'C': -4.0, 'D': -1.0, 'E': -1.0, 'F': -4.0, 'G': -2.0, 'H': -2.0, 'I': -3.0, 'K': -1.0, 'L': -4.0, 'M': -3.0, 'N': -2.0, 'P': 10.0, 'Q': -1.0, 'R': -3.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -4.0, 'Y': -3.0},
'Q': {'A': -1.0, 'C': -3.0, 'D': 0.0, 'E': 2.0, 'F': -4.0, 'G': -2.0, 'H': 1.0, 'I': -3.0, 'K': 2.0, 'L': -2.0, 'M': 0.0, 'N': 0.0, 'P': -1.0, 'Q': 7.0, 'R': 1.0, 'S': 0.0, 'T': -1.0, 'V': -3.0, 'W': -1.0, 'Y': -1.0},
'R': {'A': -2.0, 'C': -4.0, 'D': -2.0, 'E': 0.0, 'F': -3.0, 'G': -3.0, 'H': 0.0, 'I': -4.0, 'K': 3.0, 'L': -3.0, 'M': -2.0, 'N': -1.0, 'P': -3.0, 'Q': 1.0, 'R': 7.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -1.0},
'S': {'A': 1.0, 'C': -1.0, 'D': 0.0, 'E': -1.0, 'F': -3.0, 'G': 0.0, 'H': -1.0, 'I': -3.0, 'K': 0.0, 'L': -3.0, 'M': -2.0, 'N': 1.0, 'P': -1.0, 'Q': 0.0, 'R': -1.0, 'S': 5.0, 'T': 2.0, 'V': -2.0, 'W': -4.0, 'Y': -2.0},
'T': {'A': 0.0, 'C': -1.0, 'D': -1.0, 'E': -1.0, 'F': -2.0, 'G': -2.0, 'H': -2.0, 'I': -1.0, 'K': -1.0, 'L': -1.0, 'M': -1.0, 'N': 0.0, 'P': -1.0, 'Q': -1.0, 'R': -1.0, 'S': 2.0, 'T': 5.0, 'V': 0.0, 'W': -3.0, 'Y': -2.0},
'V': {'A': 0.0, 'C': -1.0, 'D': -4.0, 'E': -3.0, 'F': -1.0, 'G': -4.0, 'H': -4.0, 'I': 4.0, 'K': -3.0, 'L': 1.0, 'M': 1.0, 'N': -3.0, 'P': -3.0, 'Q': -3.0, 'R': -3.0, 'S': -2.0, 'T': 0.0, 'V': 5.0, 'W': -3.0, 'Y': -1.0},
'W': {'A': -3.0, 'C': -5.0, 'D': -5.0, 'E': -3.0, 'F': 1.0, 'G': -3.0, 'H': -3.0, 'I': -3.0, 'K': -3.0, 'L': -2.0, 'M': -1.0, 'N': -4.0, 'P': -4.0, 'Q': -1.0, 'R': -3.0, 'S': -4.0, 'T': -3.0, 'V': -3.0, 'W': 15.0, 'Y': 2.0},
'Y': {'A': -2.0, 'C': -3.0, 'D': -3.0, 'E': -2.0, 'F': 4.0, 'G': -3.0, 'H': 2.0, 'I': -1.0, 'K': -2.0, 'L': -1.0, 'M': 0.0, 'N': -2.0, 'P': -3.0, 'Q': -1.0, 'R': -1.0, 'S': -2.0, 'T': -2.0, 'V': -1.0, 'W': 2.0, 'Y': 8.0}}
def blosum50_scorer(x,y):
return blosum50[x][y]
Define a function to compute a score matrix, given two sequences and a score matrix.
def generate_score_matrix(seq1,seq2,substitution_matrix):
# Initialize a matrix to use for storing the scores
score_matrix = []
# Iterate over the amino acids in sequence two (which will correspond
# to the vertical sequence in the matrix)
for aa2 in seq2:
# Initialize the current row of the matrix
current_row = []
# Iterate over the amino acids in sequence one (which will
# correspond to the horizontal sequence in the matrix)
for aa1 in seq1:
# score as 1 if the bases are equal and 0 if they're not
current_row.append(blosum50[aa1][aa2])
# append the current row to the matrix
score_matrix.append(current_row)
return score_matrix
Define a function to compute the Needleman-Wunsch matrix and the traceback matrix.
def generate_nw_and_traceback_matrices(seq1,seq2,gap_penalty,substitution_matrix):
# Initialize a matrix to use for scoring the alignment and for tracing
# back the best alignment
nw_matrix = [[-gap_penalty * i for i in range(0,len(seq1)+1)]]
traceback_matrix = [[None] + ['-' for i in range(0,len(seq1))]]
# Iterate over the amino acids in sequence two (which will correspond
# to the vertical sequence in the matrix)
# Note that i corresponds to column numbers, as in the 'Biological Sequence
# Analysis' example from class
for i,aa2 in zip(range(1,len(seq2)+1),seq2):
# Initialize the current row of the matrix
current_row = [i * -gap_penalty]
current_traceback_matrix_row = ['|']
# Iterate over the amino acids in sequence one (which will
# correspond to the horizontal sequence in the matrix)
# Note that j corresponds to row numbers, as in the 'Biological Sequence
# Analysis' example from class
for j,aa1 in zip(range(1,len(seq1)+1),seq1):
substitution_score = substitution_matrix[aa1][aa2]
diag_score = (nw_matrix[i-1][j-1] + substitution_score,'\\')
up_score = (nw_matrix[i-1][j] - gap_penalty,'|')
left_score = (current_row[-1] - gap_penalty,'-')
best_score = max(diag_score,up_score,left_score)
current_row.append(best_score[0])
current_traceback_matrix_row.append(best_score[1])
# append the current row to the matrix
nw_matrix.append(current_row)
traceback_matrix.append(current_traceback_matrix_row)
return nw_matrix, traceback_matrix
Define a function to perform the traceback.
def nw_traceback(traceback_matrix,nw_matrix,seq1,seq2,gap_character='-'):
aligned_seq1 = []
aligned_seq2 = []
current_row = len(traceback_matrix) - 1
current_col = len(traceback_matrix[0]) - 1
best_score = nw_matrix[current_row][current_col]
while True:
current_value = traceback_matrix[current_row][current_col]
if current_value == '\\':
aligned_seq1.append(seq1[current_col-1])
aligned_seq2.append(seq2[current_row-1])
current_row -= 1
current_col -= 1
elif current_value == '|':
aligned_seq1.append('-')
aligned_seq2.append(seq2[current_row-1])
current_row -= 1
elif current_value == '-':
aligned_seq1.append(seq1[current_col-1])
aligned_seq2.append('-')
current_col -= 1
elif current_value == None:
break
else:
raise ValueError, "Invalid value in traceback matrix: %s" % current_value
return ''.join(aligned_seq1[::-1]), ''.join(aligned_seq2[::-1]), best_score
Define some functions to conveniently format the score matrix and the dynamic programming (in our case the Needleman-Wunsch) matrix for viewing.
def format_score_matrix(seq1,seq2,score_matrix,title):
# define a format string that will be used to format each line -
# since seq1 is the 'horizontal' sequence in our matrix we'll
# have len(seq1) + 1 entries on each line to print the scores and
# seq2 base associated with each row
line_format = "%6s" * (len(seq1) + 1)
print "\n%s" % title
# print seq1 (start the line with an empty string)
print line_format % tuple([' '] + map(str,list(seq1)))
# iterate over the rows and print each (starting with the
# corresponding base in sequence2)
for row, base in zip(score_matrix,seq2):
print line_format % tuple([base] + map(str,row))
def format_dynamic_programming_matrix(seq1,seq2,matrix,title):
print "\n%s" % title
line_format = "%6s" * (len(seq1) + 2)
# print seq1 (start the line with two empty strings)
print line_format % tuple([' ',' '] + map(str,list(seq1)))
# iterate over the rows and print each (starting with the
# corresponding base in sequence2)
for row, base in zip(matrix,' ' + seq2):
print line_format % tuple([base] + map(str,row))
def main():
seq1 = "HEAGAWGHEE"
seq2 = "PAWHEAE"
score_matrix = generate_score_matrix(seq1,seq2,blosum50)
format_score_matrix(seq1,
seq2,
score_matrix,
title="Score matrix (based on BLOSUM50)")
nw_matrix, traceback_matrix = generate_nw_and_traceback_matrices(seq1,
seq2,
8,
blosum50)
aligned_seq1, aligned_seq2, score = nw_traceback(traceback_matrix,nw_matrix,seq1,seq2)
format_dynamic_programming_matrix(seq1,
seq2,
nw_matrix,
title="Global dynamic programming matrix")
format_dynamic_programming_matrix(seq1,
seq2,
traceback_matrix,
title="Traceback matrix")
print "\nAlignment:"
print aligned_seq1
print aligned_seq2
print '\nAlignment score:'
print score
main()