from __future__ import division from random import choice blosum50 = {'A': {'A': 5.0, 'C': -1.0, 'D': -2.0, 'E': -1.0, 'F': -3.0, 'G': 0.0, 'H': -2.0, 'I': -1.0, 'K': -1.0, 'L': -2.0, 'M': -1.0, 'N': -1.0, 'P': -1.0, 'Q': -1.0, 'R': -2.0, 'S': 1.0, 'T': 0.0, 'V': 0.0, 'W': -3.0, 'Y': -2.0}, 'C': {'A': -1.0, 'C': 13.0, 'D': -4.0, 'E': -3.0, 'F': -2.0, 'G': -3.0, 'H': -3.0, 'I': -2.0, 'K': -3.0, 'L': -2.0, 'M': -2.0, 'N': -2.0, 'P': -4.0, 'Q': -3.0, 'R': -4.0, 'S': -1.0, 'T': -1.0, 'V': -1.0, 'W': -5.0, 'Y': -3.0}, 'D': {'A': -2.0, 'C': -4.0, 'D': 8.0, 'E': 2.0, 'F': -5.0, 'G': -1.0, 'H': -1.0, 'I': -4.0, 'K': -1.0, 'L': -4.0, 'M': -4.0, 'N': 2.0, 'P': -1.0, 'Q': 0.0, 'R': -2.0, 'S': 0.0, 'T': -1.0, 'V': -4.0, 'W': -5.0, 'Y': -3.0}, 'E': {'A': -1.0, 'C': -3.0, 'D': 2.0, 'E': 6.0, 'F': -3.0, 'G': -3.0, 'H': 0.0, 'I': -4.0, 'K': 1.0, 'L': -3.0, 'M': -2.0, 'N': 0.0, 'P': -1.0, 'Q': 2.0, 'R': 0.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -2.0}, 'F': {'A': -3.0, 'C': -2.0, 'D': -5.0, 'E': -3.0, 'F': 8.0, 'G': -4.0, 'H': -1.0, 'I': 0.0, 'K': -4.0, 'L': 1.0, 'M': 0.0, 'N': -4.0, 'P': -4.0, 'Q': -4.0, 'R': -3.0, 'S': -3.0, 'T': -2.0, 'V': -1.0, 'W': 1.0, 'Y': 4.0}, 'G': {'A': 0.0, 'C': -3.0, 'D': -1.0, 'E': -3.0, 'F': -4.0, 'G': 8.0, 'H': -2.0, 'I': -4.0, 'K': -2.0, 'L': -4.0, 'M': -3.0, 'N': 0.0, 'P': -2.0, 'Q': -2.0, 'R': -3.0, 'S': 0.0, 'T': -2.0, 'V': -4.0, 'W': -3.0, 'Y': -3.0}, 'H': {'A': -2.0, 'C': -3.0, 'D': -1.0, 'E': 0.0, 'F': -1.0, 'G': -2.0, 'H': 10.0, 'I': -4.0, 'K': 0.0, 'L': -3.0, 'M': -1.0, 'N': 1.0, 'P': -2.0, 'Q': 1.0, 'R': 0.0, 'S': -1.0, 'T': -2.0, 'V': -4.0, 'W': -3.0, 'Y': 2.0}, 'I': {'A': -1.0, 'C': -2.0, 'D': -4.0, 'E': -4.0, 'F': 0.0, 'G': -4.0, 'H': -4.0, 'I': 5.0, 'K': -3.0, 'L': 2.0, 'M': 2.0, 'N': -3.0, 'P': -3.0, 'Q': -3.0, 'R': -4.0, 'S': -3.0, 'T': -1.0, 'V': 4.0, 'W': -3.0, 'Y': -1.0}, 'K': {'A': -1.0, 'C': -3.0, 'D': -1.0, 'E': 1.0, 'F': -4.0, 'G': -2.0, 'H': 0.0, 'I': -3.0, 'K': 6.0, 'L': -3.0, 'M': -2.0, 'N': 0.0, 'P': -1.0, 'Q': 2.0, 'R': 3.0, 'S': 0.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -2.0}, 'L': {'A': -2.0, 'C': -2.0, 'D': -4.0, 'E': -3.0, 'F': 1.0, 'G': -4.0, 'H': -3.0, 'I': 2.0, 'K': -3.0, 'L': 5.0, 'M': 3.0, 'N': -4.0, 'P': -4.0, 'Q': -2.0, 'R': -3.0, 'S': -3.0, 'T': -1.0, 'V': 1.0, 'W': -2.0, 'Y': -1.0}, 'M': {'A': -1.0, 'C': -2.0, 'D': -4.0, 'E': -2.0, 'F': 0.0, 'G': -3.0, 'H': -1.0, 'I': 2.0, 'K': -2.0, 'L': 3.0, 'M': 7.0, 'N': -2.0, 'P': -3.0, 'Q': 0.0, 'R': -2.0, 'S': -2.0, 'T': -1.0, 'V': 1.0, 'W': -1.0, 'Y': 0.0}, 'N': {'A': -1.0, 'C': -2.0, 'D': 2.0, 'E': 0.0, 'F': -4.0, 'G': 0.0, 'H': 1.0, 'I': -3.0, 'K': 0.0, 'L': -4.0, 'M': -2.0, 'N': 7.0, 'P': -2.0, 'Q': 0.0, 'R': -1.0, 'S': 1.0, 'T': 0.0, 'V': -3.0, 'W': -4.0, 'Y': -2.0}, 'P': {'A': -1.0, 'C': -4.0, 'D': -1.0, 'E': -1.0, 'F': -4.0, 'G': -2.0, 'H': -2.0, 'I': -3.0, 'K': -1.0, 'L': -4.0, 'M': -3.0, 'N': -2.0, 'P': 10.0, 'Q': -1.0, 'R': -3.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -4.0, 'Y': -3.0}, 'Q': {'A': -1.0, 'C': -3.0, 'D': 0.0, 'E': 2.0, 'F': -4.0, 'G': -2.0, 'H': 1.0, 'I': -3.0, 'K': 2.0, 'L': -2.0, 'M': 0.0, 'N': 0.0, 'P': -1.0, 'Q': 7.0, 'R': 1.0, 'S': 0.0, 'T': -1.0, 'V': -3.0, 'W': -1.0, 'Y': -1.0}, 'R': {'A': -2.0, 'C': -4.0, 'D': -2.0, 'E': 0.0, 'F': -3.0, 'G': -3.0, 'H': 0.0, 'I': -4.0, 'K': 3.0, 'L': -3.0, 'M': -2.0, 'N': -1.0, 'P': -3.0, 'Q': 1.0, 'R': 7.0, 'S': -1.0, 'T': -1.0, 'V': -3.0, 'W': -3.0, 'Y': -1.0}, 'S': {'A': 1.0, 'C': -1.0, 'D': 0.0, 'E': -1.0, 'F': -3.0, 'G': 0.0, 'H': -1.0, 'I': -3.0, 'K': 0.0, 'L': -3.0, 'M': -2.0, 'N': 1.0, 'P': -1.0, 'Q': 0.0, 'R': -1.0, 'S': 5.0, 'T': 2.0, 'V': -2.0, 'W': -4.0, 'Y': -2.0}, 'T': {'A': 0.0, 'C': -1.0, 'D': -1.0, 'E': -1.0, 'F': -2.0, 'G': -2.0, 'H': -2.0, 'I': -1.0, 'K': -1.0, 'L': -1.0, 'M': -1.0, 'N': 0.0, 'P': -1.0, 'Q': -1.0, 'R': -1.0, 'S': 2.0, 'T': 5.0, 'V': 0.0, 'W': -3.0, 'Y': -2.0}, 'V': {'A': 0.0, 'C': -1.0, 'D': -4.0, 'E': -3.0, 'F': -1.0, 'G': -4.0, 'H': -4.0, 'I': 4.0, 'K': -3.0, 'L': 1.0, 'M': 1.0, 'N': -3.0, 'P': -3.0, 'Q': -3.0, 'R': -3.0, 'S': -2.0, 'T': 0.0, 'V': 5.0, 'W': -3.0, 'Y': -1.0}, 'W': {'A': -3.0, 'C': -5.0, 'D': -5.0, 'E': -3.0, 'F': 1.0, 'G': -3.0, 'H': -3.0, 'I': -3.0, 'K': -3.0, 'L': -2.0, 'M': -1.0, 'N': -4.0, 'P': -4.0, 'Q': -1.0, 'R': -3.0, 'S': -4.0, 'T': -3.0, 'V': -3.0, 'W': 15.0, 'Y': 2.0}, 'Y': {'A': -2.0, 'C': -3.0, 'D': -3.0, 'E': -2.0, 'F': 4.0, 'G': -3.0, 'H': 2.0, 'I': -1.0, 'K': -2.0, 'L': -1.0, 'M': 0.0, 'N': -2.0, 'P': -3.0, 'Q': -1.0, 'R': -1.0, 'S': -2.0, 'T': -2.0, 'V': -1.0, 'W': 2.0, 'Y': 8.0}} def blosum50_scorer(x,y): return blosum50[x][y] def generate_score_matrix(seq1,seq2,substitution_matrix): # Initialize a matrix to use for storing the scores score_matrix = [] # Iterate over the amino acids in sequence two (which will correspond # to the vertical sequence in the matrix) for aa2 in seq2: # Initialize the current row of the matrix current_row = [] # Iterate over the amino acids in sequence one (which will # correspond to the horizontal sequence in the matrix) for aa1 in seq1: # score as 1 if the bases are equal and 0 if they're not current_row.append(blosum50[aa1][aa2]) # append the current row to the matrix score_matrix.append(current_row) return score_matrix def generate_nw_and_traceback_matrices(seq1,seq2,gap_penalty,substitution_matrix): # Initialize a matrix to use for scoring the alignment and for tracing # back the best alignment nw_matrix = [[-gap_penalty * i for i in range(0,len(seq1)+1)]] traceback_matrix = [[None] + ['-' for i in range(0,len(seq1))]] # Iterate over the amino acids in sequence two (which will correspond # to the vertical sequence in the matrix) # Note that i corresponds to column numbers, as in the 'Biological Sequence # Analysis' example from class for i,aa2 in zip(range(1,len(seq2)+1),seq2): # Initialize the current row of the matrix current_row = [i * -gap_penalty] current_traceback_matrix_row = ['|'] # Iterate over the amino acids in sequence one (which will # correspond to the horizontal sequence in the matrix) # Note that j corresponds to row numbers, as in the 'Biological Sequence # Analysis' example from class for j,aa1 in zip(range(1,len(seq1)+1),seq1): substitution_score = substitution_matrix[aa1][aa2] diag_score = (nw_matrix[i-1][j-1] + substitution_score,'\\') up_score = (nw_matrix[i-1][j] - gap_penalty,'|') left_score = (current_row[-1] - gap_penalty,'-') best_score = max(diag_score,up_score,left_score) current_row.append(best_score[0]) current_traceback_matrix_row.append(best_score[1]) # append the current row to the matrix nw_matrix.append(current_row) traceback_matrix.append(current_traceback_matrix_row) return nw_matrix, traceback_matrix def nw_traceback(traceback_matrix,nw_matrix,seq1,seq2,gap_character='-'): aligned_seq1 = [] aligned_seq2 = [] current_row = len(traceback_matrix) - 1 current_col = len(traceback_matrix[0]) - 1 best_score = nw_matrix[current_row][current_col] while True: current_value = traceback_matrix[current_row][current_col] if current_value == '\\': aligned_seq1.append(seq1[current_col-1]) aligned_seq2.append(seq2[current_row-1]) current_row -= 1 current_col -= 1 elif current_value == '|': aligned_seq1.append('-') aligned_seq2.append(seq2[current_row-1]) current_row -= 1 elif current_value == '-': aligned_seq1.append(seq1[current_col-1]) aligned_seq2.append('-') current_col -= 1 elif current_value == None: break else: raise ValueError, "Invalid value in traceback matrix: %s" % current_value return ''.join(aligned_seq1[::-1]), ''.join(aligned_seq2[::-1]), best_score def format_score_matrix(seq1,seq2,score_matrix,title): # define a format string that will be used to format each line - # since seq1 is the 'horizontal' sequence in our matrix we'll # have len(seq1) + 1 entries on each line to print the scores and # seq2 base associated with each row line_format = "%6s" * (len(seq1) + 1) print "\n%s" % title # print seq1 (start the line with an empty string) print line_format % tuple([' '] + map(str,list(seq1))) # iterate over the rows and print each (starting with the # corresponding base in sequence2) for row, base in zip(score_matrix,seq2): print line_format % tuple([base] + map(str,row)) def format_dynamic_programming_matrix(seq1,seq2,matrix,title): print "\n%s" % title line_format = "%6s" * (len(seq1) + 2) # print seq1 (start the line with two empty strings) print line_format % tuple([' ',' '] + map(str,list(seq1))) # iterate over the rows and print each (starting with the # corresponding base in sequence2) for row, base in zip(matrix,' ' + seq2): print line_format % tuple([base] + map(str,row)) def main(): seq1 = "HEAGAWGHEE" seq2 = "PAWHEAE" score_matrix = generate_score_matrix(seq1,seq2,blosum50) format_score_matrix(seq1, seq2, score_matrix, title="Score matrix (based on BLOSUM50)") nw_matrix, traceback_matrix = generate_nw_and_traceback_matrices(seq1, seq2, 8, blosum50) aligned_seq1, aligned_seq2, score = nw_traceback(traceback_matrix,nw_matrix,seq1,seq2) format_dynamic_programming_matrix(seq1, seq2, nw_matrix, title="Global dynamic programming matrix") format_dynamic_programming_matrix(seq1, seq2, traceback_matrix, title="Traceback matrix") print "\nAlignment:" print aligned_seq1 print aligned_seq2 print '\nAlignment score:' print score main()