from yahmm import * model = Model( name="Global Sequence Aligner" ) # Define the distribution for insertions i_d = DiscreteDistribution( { 'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25 } ) # Create the insert states i0 = State( i_d, name="I0" ) i1 = State( i_d, name="I1" ) i2 = State( i_d, name="I2" ) i3 = State( i_d, name="I3" ) # Create the match states m1 = State( DiscreteDistribution({ "A": 0.95, 'C': 0.01, 'G': 0.01, 'T': 0.02 }) , name="M1" ) m2 = State( DiscreteDistribution({ "A": 0.003, 'C': 0.99, 'G': 0.003, 'T': 0.004 }) , name="M2" ) m3 = State( DiscreteDistribution({ "A": 0.01, 'C': 0.01, 'G': 0.01, 'T': 0.97 }) , name="M3" ) # Create the delete states d1 = State( None, name="D1" ) d2 = State( None, name="D2" ) d3 = State( None, name="D3" ) # Add all the states to the model model.add_states( [i0, i1, i2, i3, m1, m2, m3, d1, d2, d3 ] ) # Create transitions from match states model.add_transition( model.start, m1, 0.9 ) model.add_transition( model.start, i0, 0.1 ) model.add_transition( m1, m2, 0.9 ) model.add_transition( m1, i1, 0.05 ) model.add_transition( m1, d2, 0.05 ) model.add_transition( m2, m3, 0.9 ) model.add_transition( m2, i2, 0.05 ) model.add_transition( m2, d3, 0.05 ) model.add_transition( m3, model.end, 0.9 ) model.add_transition( m3, i3, 0.1 ) # Create transitions from insert states model.add_transition( i0, i0, 0.70 ) model.add_transition( i0, d1, 0.15 ) model.add_transition( i0, m1, 0.15 ) model.add_transition( i1, i1, 0.70 ) model.add_transition( i1, d2, 0.15 ) model.add_transition( i1, m2, 0.15 ) model.add_transition( i2, i2, 0.70 ) model.add_transition( i2, d3, 0.15 ) model.add_transition( i2, m3, 0.15 ) model.add_transition( i3, i3, 0.85 ) model.add_transition( i3, model.end, 0.15 ) # Create transitions from delete states model.add_transition( d1, d2, 0.15 ) model.add_transition( d1, i1, 0.15 ) model.add_transition( d1, m2, 0.70 ) model.add_transition( d2, d3, 0.15 ) model.add_transition( d2, i2, 0.15 ) model.add_transition( d2, m3, 0.70 ) model.add_transition( d3, i3, 0.30 ) model.add_transition( d3, model.end, 0.70 ) # Call bake to finalize the structure of the model. model.bake() for sequence in map( list, ('ACT', 'GGC', 'GAT', 'ACC') ): logp, path = model.viterbi( sequence ) print "Sequence: '{}' -- Log Probability: {} -- Path: {}".format( ''.join( sequence ), logp, " ".join( state.name for idx, state in path[1:-1] ) ) for sequence in map( list, ('A', 'GA', 'AC', 'AT', 'ATCC', 'ACGTG', 'ATTT', 'TACCCTC', 'TGTCAACACT') ): logp, path = model.viterbi( sequence ) print "Sequence: '{}' -- Log Probability: {} -- Path: {}".format( ''.join( sequence ), logp, " ".join( state.name for idx, state in path[1:-1] ) ) def path_to_alignment( x, y, path ): """ This function will take in two sequences, and the ML path which is their alignment, and insert dashes appropriately to make them appear aligned. This consists only of adding a dash to the model sequence for every insert in the path appropriately, and a dash in the observed sequence for every delete in the path appropriately. """ for i, (index, state) in enumerate( path[1:-1] ): name = state.name if name.startswith( 'D' ): y = y[:i] + '-' + y[i:] elif name.startswith( 'I' ): x = x[:i] + '-' + x[i:] return x, y for sequence in map( list, ('A', 'GA', 'AC', 'AT', 'ATCC' ) ): logp, path = model.viterbi( sequence ) x, y = path_to_alignment( 'ACT', ''.join(sequence), path ) print "Sequence: {}, Log Probability: {}".format( ''.join(sequence), logp ) print "{}\n{}".format( x, y ) print import itertools sequences = reduce( lambda x, y: x+y, [[ seq for seq in it.product( 'ACGT', repeat=i ) ] for i in xrange( 1,6 )] ) scores = map( model.log_probability, sequences ) import seaborn as sns %pylab inline plt.figure( figsize=(10,5) ) sns.kdeplot( numpy.array( scores ), shade=True ) plt.ylabel('Density') plt.xlabel('Log Probability') plt.show() plt.figure( figsize=(10,5) ) for i in xrange( 1, 6 ): subset = it.product( 'ACGT', repeat=i ) subset_scores = map( model.log_probability, subset ) sns.kdeplot( numpy.array( subset_scores ), color='rgbyc'[i-1], shade=True, label="Sequence Length " + str(i) ) plt.ylabel('Density') plt.xlabel('Log Probability') plt.legend() plt.figure( figsize=(10, 5) ) subsequences = it.product( 'ACT', repeat=3 ) scores = map( model.log_probability, subsequences ) sns.kdeplot( numpy.array( scores ), color='r', shade=True, label='Without G' ) subsequences = filter( lambda seq: 'G' in seq, it.product( 'ACGT', repeat=3 ) ) scores = map( model.log_probability, subsequences ) sns.kdeplot( numpy.array( scores ), color='b', shade=True, label='Including G') plt.ylabel('Density') plt.xlabel('Log Probability') plt.legend()