#!/usr/bin/env python # coding: utf-8 # In[1]: import math from collections import defaultdict class MarkovChainOrder(object): ''' Simple higher-order Markov chain, specifically for DNA sequences. User provides training data (DNA strings). ''' def __init__(self, examples, order=1): ''' Initialize the model given collection of example DNA strings. ''' self.order = order self.mers = defaultdict(int) self.longestMer = longestMer = order + 1 for ex in examples: # count up all k-mers of length 'longestMer' or shorter for i in range(len(ex) - longestMer + 1): for j in range(longestMer, -1, -1): self.mers[ex[i:i+j]] += 1 def condProb(self, obs, given): ''' Return conditional probability of seeing nucleotide "obs" given we just saw nucleotide string "given". Length of "given" can't exceed model order. Return None if "given" was never observed. ''' assert len(given) <= self.order ngiven = self.mers[given] if ngiven == 0: return None return float(self.mers[given + obs]) / self.mers[given] def jointProb(self, s): ''' Return joint probability of observing string s ''' cum = 1.0 for i in range(self.longestMer-1, len(s)): obs, given = s[i], s[i-self.longestMer+1:i] p = self.condProb(obs, given) if p is not None: cum *= p # include the smaller k-mers at the very beginning for j in range(self.longestMer-2, -1, -1): obs, given = s[j], s[:j] p = self.condProb(obs, given) if p is not None: cum *= p return cum def jointProbL(self, s): ''' Return log2 of joint probability of observing string s ''' cum = 0.0 for i in range(self.longestMer-1, len(s)): obs, given = s[i], s[i-self.longestMer+1:i] p = self.condProb(obs, given) if p is not None: cum += math.log(p, 2) # include the smaller k-mers at the very beginning for j in range(self.longestMer-2, -1, -1): obs, given = s[j], s[:j] p = self.condProb(obs, given) if p is not None: cum += math.log(p, 2) return cum # In[2]: mc1 = MarkovChainOrder(['AC' * 10], 1) # In[3]: mc1.condProb('A', 'C') # should be 1; C always followed by A # In[4]: mc1.condProb('C', 'A') # should be 1; A always followed by C # In[5]: mc1.condProb('G', 'A') # should be 0; A occurs but is never followed by G # In[6]: mc2 = MarkovChainOrder(['AC' * 10], 2) # In[7]: mc2.condProb('A', 'AC') # AC always followed by A # In[8]: mc2.condProb('C', 'CA') # CA always followed by C # In[9]: mc2.condProb('C', 'AA') is None # because AA doesn't occur # In[10]: mc3 = MarkovChainOrder(['AAA1AAA2AAA2AAA3AAA3AAA3'], 3) # In[11]: mc3.condProb('2', 'AAA') # 1/3 # In[12]: mc3.condProb('3', 'AAA') # 1/2 # In[13]: p1 = mc3.condProb('A', '') p1 # In[14]: p2 = mc3.condProb('A', 'A') p2 # In[15]: p3 = mc3.condProb('A', 'AA') p3 # In[16]: p4 = mc3.condProb('1', 'AAA') p4 # In[17]: p1 * p2 * p3 * p4, mc3.jointProb('AAA1') # should be equal # In[18]: import math math.log(mc3.jointProb('AAA1'), 2), mc3.jointProbL('AAA1') # should be equal