import collections
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy.stats
STOP_SEQS = ('TAA', 'TAG', 'TGA')
# Estimate the probabilities of each character from the dataset.
data = open('data/chrIV.txt').read()
counts = collections.Counter(data)
total = sum(counts.itervalues())
probs = dict([(l, count / float(total)) for l, count in counts.items()])
print probs
{'A': 0.31121530771907124, 'C': 0.1888705315441341, 'T': 0.30972046427617916, 'G': 0.19019369646061546}
# Compute the probability that 3 random characters are a stop sequence.
def prob_seq(seq, probs):
"""Probability of a sequence of characters like 'TAA' assuming independence."""
return np.product([probs[c] for c in seq])
prob_stop_seq = sum(prob_seq(stop_seq, probs) for stop_seq in STOP_SEQS)
print 'Estimated probability of 3 random characters being a stop sequence:', prob_stop_seq
Estimated probability of 3 random characters being a stop sequence: 0.0666634111351
def split(string, offset):
"""Split a string into triples starting at the given offset."""
return [string[start:(start+3)] for start in range(offset, len(string), 3)]
test_data = "test data"
for i in xrange(3):
print split(test_data, i)
['tes', 't d', 'ata'] ['est', ' da', 'ta'] ['st ', 'dat', 'a']
# Get all possible ORFs.
def get_possible_orfs(data, offset):
"""Return a list of all possible ORFs (i.e., all strings between stop sequences) after
splitting the data into triples starting at the given offset."""
possible_orfs = []
orf_triples = []
for triple in split(data, offset):
if triple in STOP_SEQS:
possible_orfs.append(''.join(orf_triples))
orf_triples = []
else:
orf_triples.append(triple)
possible_orfs.append(''.join(orf_triples))
return possible_orfs
all_possible_orfs = []
for offset in xrange(3):
possible_orfs = get_possible_orfs(data, offset)
all_possible_orfs.extend(possible_orfs)
print len(all_possible_orfs), 'possible ORFs. First 10:'
for i in xrange(10):
print all_possible_orfs[i]
85727 possible ORFs. First 10: ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACACCCACACACCCACACCCACACACCACACACCACACCACACCACACCCACACCCACACCACACCCACACCCACACACCACACCCACACCCACACACCACACACTACCCCTAACACTACCCTATTC CCC TTTTACCTGTCTCCAAACCTACCCTCACATTACCCTACCTCCCCACTCGTTACCCTGCCCCACTCAACCATCCACTCCCAACCACCATCCATCTCTCTACTTACCACTAACCACCGTCCACCA CCGTTACCCTCCAATTACCCATATCCAACTCCACTACCACTTACCCTACTATTACCCTACCATCCACCATGTCCTACTCACTGTACTGTTGTTCTACCCTCCATATTGAAACGT CAAATGATCGTAAATAATACATACATACTTACCCTACCACTCTATACCACCACTACCACCACCGCCACTTGCCACACTCACCTTCACTTCTAC TATGTCATACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTTTCATATTCCACACCATGGCCCCATTCTCACTAAATCAGTACCAAATGCACTCACATCATTATTCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACGCA CTCCCACGATTATCCACATTTTATTACCTATATCTCATTCGGCGGCCCCAAATATTGTATAACTGCTCT TACATACGTTATACCACTTTTACACCGTATACTAACCACTCAATTTATATACACTTATGCCAATATTACAAAAAAATCACCACTAAAATCACC ACA AAATATTCTATTCTTCAACCA
# Get the CDF for a geometric distribution based on the probability of seeing a stop sequence.
cdf = scipy.stats.geom(prob_stop_seq).cdf
# Compute the p-values of each possible ORF.
probs = 1 - np.array([cdf(len(possible_orf)) for possible_orf in all_possible_orfs])
print probs[:10]
[ 4.97482250e-06 8.13045545e-01 2.06400075e-04 3.84030066e-04 1.63516321e-03 1.40024238e-05 8.56331878e-03 1.63516321e-03 8.13045545e-01 2.34857331e-01]
# Plot a histogram of the p-values.
plt.hist(probs, bins=100)
plt.title('Stuff and things based on numbers')
<matplotlib.text.Text at 0xbee2a4c>