Notebook

In [1]:

import matplotlib.pyplot as plt
import numpy as np
import sympy

In [2]:

# For each factor, the number of mutations the number of generations from its ancestor factor.
# Used to calculate the probability of the data.
old_mutations_and_generations = [
        (0, 3),  # Samuel
        (0, 3),  # Willard
        (1, 5),  # T8
        (0, 5),  # T6
        (0, 6),  # T3
        (1, 11), # T4
        (3, 10)] # T5
new_mutations_and_generations = [
        (0, 1),  # Jacob: new factor
        (0, 2),  # Samuel: 3 -> 2
        (0, 3),  # Willard
        (1, 5),  # T8
        (0, 5),  # T6
        (0, 6),  # T3
        (1, 10), # T4: 11 -> 10
        (3, 10)] # T5

In [3]:

def get_prob_r_given_data(mutations_and_generations):
    """
    Returns a vectorized function that takes a value for r and returns P(r | data)
    computed using Bayes' rule. The 'mutations_and_generations' parameter is used
    to calculate the probability of the data.
    """
    
    r = sympy.symbols('r')
    
    p_data_given_r = 1
    for mutations, generations in mutations_and_generations:
        n = generations * 37
        k = mutations
        p_data_given_r *= sympy.binomial(n, k) * r**k * (1 - r)**(n - k)
        
    p_r = 1 / r
    p_data = sympy.integrate(p_data_given_r * p_r, (r, 0, 1))
    expr = (p_data_given_r * p_r) / p_data
    return np.vectorize(lambda r_val: expr.subs(r, r_val))

In [4]:

old_prob_r_func = get_prob_r_given_data(old_mutations_and_generations)
new_prob_r_func = get_prob_r_given_data(new_mutations_and_generations)

In [5]:

rs = np.linspace(1e-6, 0.01, num=1000)
plt.plot(rs, old_prob_r_func(rs), label='old')
plt.plot(rs, new_prob_r_func(rs), label='new')
plt.xlim(0, 0.01)
plt.legend()
plt.show()