In [1]:

import collections
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy.stats

In [2]:

STOP_SEQS = ('TAA', 'TAG', 'TGA')

In [3]:

# Estimate the probabilities of each character from the dataset.
data = open('data/chrIV.txt').read()
counts = collections.Counter(data)
total = sum(counts.itervalues())
probs = dict([(l, count / float(total)) for l, count in counts.items()])
print probs

{'A': 0.31121530771907124, 'C': 0.1888705315441341, 'T': 0.30972046427617916, 'G': 0.19019369646061546}

In [4]:

# Compute the probability that 3 random characters are a stop sequence.

def prob_seq(seq, probs):
    """Probability of a sequence of characters like 'TAA' assuming independence."""
    return np.product([probs[c] for c in seq])

prob_stop_seq = sum(prob_seq(stop_seq, probs) for stop_seq in STOP_SEQS)
print 'Estimated probability of 3 random characters being a stop sequence:', prob_stop_seq

Estimated probability of 3 random characters being a stop sequence: 0.0666634111351

In [5]:

def split(string, offset):
    """Split a string into triples starting at the given offset."""
    return [string[start:(start+3)] for start in range(offset, len(string), 3)]

test_data = "test data"
for i in xrange(3):
    print split(test_data, i)

['tes', 't d', 'ata']
['est', ' da', 'ta']
['st ', 'dat', 'a']

In [6]:

# Get all possible ORFs.

def get_possible_orfs(data, offset):
    """Return a list of all possible ORFs (i.e., all strings between stop sequences) after
    splitting the data into triples starting at the given offset."""
    possible_orfs = []
    orf_triples = []
    for triple in split(data, offset):
        if triple in STOP_SEQS:
            possible_orfs.append(''.join(orf_triples))
            orf_triples = []
        else:
            orf_triples.append(triple)
    possible_orfs.append(''.join(orf_triples))
    return possible_orfs

all_possible_orfs = []
for offset in xrange(3):
    possible_orfs = get_possible_orfs(data, offset)
    all_possible_orfs.extend(possible_orfs)
    
print len(all_possible_orfs), 'possible ORFs. First 10:'
for i in xrange(10):
    print all_possible_orfs[i]

85727 possible ORFs. First 10:
ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACACCCACACACCCACACCCACACACCACACACCACACCACACCACACCCACACCCACACCACACCCACACCCACACACCACACCCACACCCACACACCACACACTACCCCTAACACTACCCTATTC
CCC
TTTTACCTGTCTCCAAACCTACCCTCACATTACCCTACCTCCCCACTCGTTACCCTGCCCCACTCAACCATCCACTCCCAACCACCATCCATCTCTCTACTTACCACTAACCACCGTCCACCA
CCGTTACCCTCCAATTACCCATATCCAACTCCACTACCACTTACCCTACTATTACCCTACCATCCACCATGTCCTACTCACTGTACTGTTGTTCTACCCTCCATATTGAAACGT
CAAATGATCGTAAATAATACATACATACTTACCCTACCACTCTATACCACCACTACCACCACCGCCACTTGCCACACTCACCTTCACTTCTAC
TATGTCATACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTTTCATATTCCACACCATGGCCCCATTCTCACTAAATCAGTACCAAATGCACTCACATCATTATTCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACGCA
CTCCCACGATTATCCACATTTTATTACCTATATCTCATTCGGCGGCCCCAAATATTGTATAACTGCTCT
TACATACGTTATACCACTTTTACACCGTATACTAACCACTCAATTTATATACACTTATGCCAATATTACAAAAAAATCACCACTAAAATCACC
ACA
AAATATTCTATTCTTCAACCA

In [7]:

# Get the CDF for a geometric distribution based on the probability of seeing a stop sequence.
cdf = scipy.stats.geom(prob_stop_seq).cdf

In [8]:

# Compute the p-values of each possible ORF.
probs = 1 - np.array([cdf(len(possible_orf)) for possible_orf in all_possible_orfs])
print probs[:10]

[  4.97482250e-06   8.13045545e-01   2.06400075e-04   3.84030066e-04
   1.63516321e-03   1.40024238e-05   8.56331878e-03   1.63516321e-03
   8.13045545e-01   2.34857331e-01]

In [9]:

# Plot a histogram of the p-values.
plt.hist(probs, bins=100)
plt.title('Stuff and things based on numbers')

Out[9]:

<matplotlib.text.Text at 0xbee2a4c>