This is a continuation of the 02-17-14 activity. The original code is below and is followed by the new code for the 02-21-14 activities.

From 02-17-14¶

In [1]:

import collections
import numpy as np
import matplotlib.pyplot as plt
import re
import scipy.stats

In [2]:

STOP_SEQS = ('TAA', 'TAG', 'TGA')

In [3]:

# Estimate the probabilities of each character from the dataset.
data = open('data/chrIV.txt').read()
counts = collections.Counter(data)
print counts
total = sum(counts.itervalues())
probs = dict([(l, count / float(total)) for l, count in counts.items()])
print probs

Counter({'A': 476761, 'T': 474471, 'G': 291364, 'C': 289337})
{'A': 0.31121530771907124, 'C': 0.1888705315441341, 'T': 0.30972046427617916, 'G': 0.19019369646061546}

In [4]:

# Compute the probability that 3 random characters are a stop sequence.

def prob_seq(seq, probs):
    """Probability of a sequence of characters like 'TAA' assuming independence."""
    return np.product([probs[c] for c in seq])

prob_stop_seq = sum(prob_seq(stop_seq, probs) for stop_seq in STOP_SEQS)
print 'Estimated probability of 3 random characters being a stop sequence:', prob_stop_seq

Estimated probability of 3 random characters being a stop sequence: 0.0666634111351

In [5]:

def split(string, offset):
    """Split a string into triples starting at the given offset."""
    return [string[start:(start+3)] for start in range(offset, len(string), 3)]

test_data = "test data"
for i in xrange(3):
    print split(test_data, i)

['tes', 't d', 'ata']
['est', ' da', 'ta']
['st ', 'dat', 'a']

In [6]:

# Get all possible ORFs.

def get_possible_orfs(data, offset):
    """Return a list of all possible ORFs (i.e., all strings between stop sequences) after
    splitting the data into triples starting at the given offset."""
    possible_orfs = []
    orf_triples = []
    for triple in split(data, offset):
        if triple in STOP_SEQS:
            possible_orfs.append(''.join(orf_triples))
            orf_triples = []
        else:
            orf_triples.append(triple)
    possible_orfs.append(''.join(orf_triples))
    return possible_orfs

all_possible_orfs = []
for offset in xrange(3):
    possible_orfs = get_possible_orfs(data, offset)
    all_possible_orfs.extend(possible_orfs)
    
print len(all_possible_orfs), 'possible ORFs. First 10:'
for i in xrange(10):
    print all_possible_orfs[i]

85727 possible ORFs. First 10:
ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACACCCACACACCCACACCCACACACCACACACCACACCACACCACACCCACACCCACACCACACCCACACCCACACACCACACCCACACCCACACACCACACACTACCCCTAACACTACCCTATTC
CCC
TTTTACCTGTCTCCAAACCTACCCTCACATTACCCTACCTCCCCACTCGTTACCCTGCCCCACTCAACCATCCACTCCCAACCACCATCCATCTCTCTACTTACCACTAACCACCGTCCACCA
CCGTTACCCTCCAATTACCCATATCCAACTCCACTACCACTTACCCTACTATTACCCTACCATCCACCATGTCCTACTCACTGTACTGTTGTTCTACCCTCCATATTGAAACGT
CAAATGATCGTAAATAATACATACATACTTACCCTACCACTCTATACCACCACTACCACCACCGCCACTTGCCACACTCACCTTCACTTCTAC
TATGTCATACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTTTCATATTCCACACCATGGCCCCATTCTCACTAAATCAGTACCAAATGCACTCACATCATTATTCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACGCA
CTCCCACGATTATCCACATTTTATTACCTATATCTCATTCGGCGGCCCCAAATATTGTATAACTGCTCT
TACATACGTTATACCACTTTTACACCGTATACTAACCACTCAATTTATATACACTTATGCCAATATTACAAAAAAATCACCACTAAAATCACC
ACA
AAATATTCTATTCTTCAACCA

In [7]:

# Get the CDF for a geometric distribution based on the probability of seeing a stop sequence.
cdf = scipy.stats.geom(prob_stop_seq).cdf

In [8]:

# Compute the p-values of each possible ORF.
probs = 1 - np.array([cdf(len(possible_orf)) for possible_orf in all_possible_orfs])
print probs[:10]

[  4.97482250e-06   8.13045545e-01   2.06400075e-04   3.84030066e-04
   1.63516321e-03   1.40024238e-05   8.56331878e-03   1.63516321e-03
   8.13045545e-01   2.34857331e-01]

In [9]:

# Plot a histogram of the p-values.
plt.hist(probs, bins=100)
plt.title('Histogram of ORF p-values')

Out[9]:

<matplotlib.text.Text at 0xc625a2c>

From 02-21-14¶

Bonferonni, alpha = 5%¶

In [10]:

alpha = 0.05
n = len(probs)
alpha_prime = alpha / n 
print alpha_prime

5.83246818389e-07

In [11]:

all_possible_orfs = np.array(all_possible_orfs)
filtered_bonferonni = all_possible_orfs[probs < alpha_prime]
print len(filtered_bonferonni), 'discoveries with Bonferonni'

1724 discoveries with Bonferonni

FDR, alpha = 5%¶

In [12]:

m = len(probs)
alpha = 0.05

sorted_indices = np.argsort(probs)
k = np.arange(1, m + 1)
thresholds = k * (alpha / m)

filtered_fdc = all_possible_orfs[sorted_indices][probs[sorted_indices] <= thresholds]
print len(filtered_fdc), 'discoveries with FDR'

19772 discoveries with FDR