This is a continuation of the 02-17-14 activity. The original code is below and is followed by the new code for the 02-21-14 activities.
import collections
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy.stats
STOP_SEQS = ('TAA', 'TAG', 'TGA')
# Estimate the probabilities of each character from the dataset.
data = open('data/chrIV.txt').read()
counts = collections.Counter(data)
print counts
total = sum(counts.itervalues())
probs = dict([(l, count / float(total)) for l, count in counts.items()])
print probs
Counter({'A': 476761, 'T': 474471, 'G': 291364, 'C': 289337}) {'A': 0.31121530771907124, 'C': 0.1888705315441341, 'T': 0.30972046427617916, 'G': 0.19019369646061546}
# Compute the probability that 3 random characters are a stop sequence.
def prob_seq(seq, probs):
"""Probability of a sequence of characters like 'TAA' assuming independence."""
return np.product([probs[c] for c in seq])
prob_stop_seq = sum(prob_seq(stop_seq, probs) for stop_seq in STOP_SEQS)
print 'Estimated probability of 3 random characters being a stop sequence:', prob_stop_seq
Estimated probability of 3 random characters being a stop sequence: 0.0666634111351
def split(string, offset):
"""Split a string into triples starting at the given offset."""
return [string[start:(start+3)] for start in range(offset, len(string), 3)]
test_data = "test data"
for i in xrange(3):
print split(test_data, i)
['tes', 't d', 'ata'] ['est', ' da', 'ta'] ['st ', 'dat', 'a']
# Get all possible ORFs.
def get_possible_orfs(data, offset):
"""Return a list of all possible ORFs (i.e., all strings between stop sequences) after
splitting the data into triples starting at the given offset."""
possible_orfs = []
orf_triples = []
for triple in split(data, offset):
if triple in STOP_SEQS:
possible_orfs.append(''.join(orf_triples))
orf_triples = []
else:
orf_triples.append(triple)
possible_orfs.append(''.join(orf_triples))
return possible_orfs
all_possible_orfs = []
for offset in xrange(3):
possible_orfs = get_possible_orfs(data, offset)
all_possible_orfs.extend(possible_orfs)
print len(all_possible_orfs), 'possible ORFs. First 10:'
for i in xrange(10):
print all_possible_orfs[i]
85727 possible ORFs. First 10: ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACACCCACACACCCACACCCACACACCACACACCACACCACACCACACCCACACCCACACCACACCCACACCCACACACCACACCCACACCCACACACCACACACTACCCCTAACACTACCCTATTC CCC TTTTACCTGTCTCCAAACCTACCCTCACATTACCCTACCTCCCCACTCGTTACCCTGCCCCACTCAACCATCCACTCCCAACCACCATCCATCTCTCTACTTACCACTAACCACCGTCCACCA CCGTTACCCTCCAATTACCCATATCCAACTCCACTACCACTTACCCTACTATTACCCTACCATCCACCATGTCCTACTCACTGTACTGTTGTTCTACCCTCCATATTGAAACGT CAAATGATCGTAAATAATACATACATACTTACCCTACCACTCTATACCACCACTACCACCACCGCCACTTGCCACACTCACCTTCACTTCTAC TATGTCATACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTTTCATATTCCACACCATGGCCCCATTCTCACTAAATCAGTACCAAATGCACTCACATCATTATTCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACGCA CTCCCACGATTATCCACATTTTATTACCTATATCTCATTCGGCGGCCCCAAATATTGTATAACTGCTCT TACATACGTTATACCACTTTTACACCGTATACTAACCACTCAATTTATATACACTTATGCCAATATTACAAAAAAATCACCACTAAAATCACC ACA AAATATTCTATTCTTCAACCA
# Get the CDF for a geometric distribution based on the probability of seeing a stop sequence.
cdf = scipy.stats.geom(prob_stop_seq).cdf
# Compute the p-values of each possible ORF.
probs = 1 - np.array([cdf(len(possible_orf)) for possible_orf in all_possible_orfs])
print probs[:10]
[ 4.97482250e-06 8.13045545e-01 2.06400075e-04 3.84030066e-04 1.63516321e-03 1.40024238e-05 8.56331878e-03 1.63516321e-03 8.13045545e-01 2.34857331e-01]
# Plot a histogram of the p-values.
plt.hist(probs, bins=100)
plt.title('Histogram of ORF p-values')
<matplotlib.text.Text at 0xc625a2c>
alpha = 0.05
n = len(probs)
alpha_prime = alpha / n
print alpha_prime
5.83246818389e-07
all_possible_orfs = np.array(all_possible_orfs)
filtered_bonferonni = all_possible_orfs[probs < alpha_prime]
print len(filtered_bonferonni), 'discoveries with Bonferonni'
1724 discoveries with Bonferonni
m = len(probs)
alpha = 0.05
sorted_indices = np.argsort(probs)
k = np.arange(1, m + 1)
thresholds = k * (alpha / m)
filtered_fdc = all_possible_orfs[sorted_indices][probs[sorted_indices] <= thresholds]
print len(filtered_fdc), 'discoveries with FDR'
19772 discoveries with FDR