Fixation of a beneficial allele¶

In [14]:

from scipy import __version__ as ver
print "Scipy", ver
from numpy import __version__ as ver
print "Numpy", ver
from matplotlib import __version__ as ver
print "Matplotlib", ver
from IPython import __version__ as ver
print "IPython", ver
from pandas import __version__ as ver
print "pandas", ver
from datetime import datetime
import os

rcParams['figure.figsize'] = (8,6)
rcParams['font.size'] = 14
rcParams['lines.linewidth'] = 2

pool = None

%load_ext ipycache

Scipy 0.12.0
Numpy 1.7.1
Matplotlib 1.3.0
IPython 2.0.0-dev
pandas 0.12.0

In [2]:

dt = datetime.now()
print dt.ctime()

Wed Sep  3 16:41:14 2014

Wright-Fisher Simulation¶

In [3]:

def simulate_fixation(s, N, x, PLOT=False):
    N = int(N)
    if PLOT:
        _ = []
    t = 0
    while not (allclose(x,1) or allclose(x,0)):
        assert 0 < x < 1
        t += 1
        if PLOT: _.append(x)
        x = x/(1-s+s*x)
        n = binomial(N,x)
        x = n/float(N)
        if PLOT: 
            if len(_)%100 == 0:
                print len(_),
    if PLOT:
        plot(range(len(_)), _)
        ylim(-0.1,1)
        xlim(0,max(10,len(_)))
        xlabel("Time")
        ylabel("Frequency")
    return allclose(x,1), t

while not simulate_fixation(0.1, 1e3, 1e-3, True)[0]:
    pass

In [82]:

def p_simulation(s, N, x, replicates=100):
    '''if os.path.exists(simulations_filename):
        print "File found.",
        df = pd.read_csv(simulations_filename)
    else:
        df = pd.DataFrame(columns=['type','s','N','x','fixation_success','fixation_time'])
    df_slice = df[(df['type']=='simple_wf') & (df['s']==s) & (df['N']==N) & (df['x']==x)]
    print "%d results found." % len(df_slice),
    if len(df_slice) < replicates:
        print "Running %d more simulations." % (replicates - len(df_slice)),'''
    if pool:       
        results = pool.map_async(simulate_fixation, range(replicates), s=s, N=N, x=x, plot=False)
    else:
        results = [simulate_fixation(s, N, x, False) for _ in range(replicates)]# - len(df))]
    d = [{'s':s,'N':N,'x':x,'type':'simple_wf','fixation_success':fixation_success,'fixation_time':fixation_time} for (fixation_success,fixation_time) in results]       
    d = pd.DataFrame(d)
    #if len(df):
    #    print "Concatinating.",
    #    df = pd.concat((df, d), ignore_index=True)
    #else:
    df = d
    #print "Writing new results to %s." % simulations_filename
    #df.to_csv(simulations_filename, index=False)
    df_slice = df#[(df['type']=='simple_wf') & (df['s']==s) & (df['N']==N) & (df['x']==x)]
    return df_slice['fixation_success'].mean(), df_slice['fixation_success'].std(ddof=1)/sqrt(len(df_slice))
p_simulation(0.1, 1e5, 1e-5, 101)

Out[82]:

(0.19801980198019803, 0.039850716430689319)

Large population¶

The probability that an indiviudal with a beneficial mutation that has fitness $1+s$ can be approximated by $p\approx 2s$. This approximation is good for a small $s$ and a large population size $N$. The approximation can be dated back to Haldane[@?] and Fisher[@?]. An accurate derivation, including many details on when the approximation is suitable, was given by Eshel[@198?].

In [25]:

def p_eshel(s,N,x):
    return (2*s)**(N*x)

Small population¶

The above approximation fails for a small population, for which Kimura[@19?] provided a more accurate approximation, $p=\frac{1-e^{-2Nsx}}{1-e^{-2Ns}}$ where $x$ is the initial frequency of the beneficial mutant. For a population starting with a single beneficial mutant, this simplifies to $\frac{1-e^{-2s}}{1-e^{-2Ns}}$, for a large population ($N\to \infty$) to $1-e^{-2s}$, and for a small $s$, to $2s$.

In [45]:

def p_kimura(s,N,x):
    return (1-exp(-2*N*s*x))/(1-exp(-2*N*s))

Comparison¶

In [66]:

def plot_kimura_eshel_simulation(x, kimura=None, eshel=None, simulation=None, x_simulation=None, xlabel='', ylabel='p'):
    fig,ax = subplots(1,1)
    if kimura != None: ax.plot(x, kimura, label='Diffusion')
    if eshel != None: ax.plot(x, eshel, label='1st Order')
    if x_simulation == None: x_simulation = x
    if simulation != None: ax.plot(x_simulation, simulation, 'og', alpha=0.5, label='Simulation')
    ax.legend(loc=9)
    ax.set_xscale('log')
    ax.grid(False,'minor')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xlim(min(x)/2,max(x)*2)
    low,high = ax.get_ylim()
    ax.set_ylim(low-(high-low)/10, high+(high-low)/10)
    return fig,ax

In [73]:

s = 0.001
N_range = array(map(int,logspace(3,7,50)))
p_eshel_N  = p_eshel(s, N_range, 1./N_range) 
p_kimura_N = p_kimura(s, N_range, 1./N_range)

In [52]:

%%cache p_simulation_N.pkl N_range_small p_simulation_N
N_range_small = array(map(int,logspace(3,7,5)))
p_simulation_N = [p_simulation(s, N, 1./N, replicates=100) for N in N_range_small]

In [74]:

plot_kimura_eshel_simulation(N_range, p_kimura_N, p_eshel_N, xlabel='N')#, p_simulation_N, 'N');

In [75]:

N = int(1e6)
s_range = logspace(-4,-1,50)
p_eshel_s  = p_eshel(s_range, N, 1./N)
p_kimura_s = p_kimura(s_range, N, 1./N)

In [248]:

%%cache p_simulation_s.pkl s_range_small p_simulation_s 
s_range_small = logspace(-4,-1,5)
p_simulation_s = [p_simulation(s, N, 1./N, replicates=100) for s in s_range_small]

Calculating... Saving results to p_simulation_s_N_1e6.gz

In [77]:

plot_kimura_eshel_simulation(s_range, p_kimura_s, p_eshel_s, xlabel='s')# p_simulation_s, 's');

Fixation time¶

Diffusion model¶

The following formula is taken from eq. (17) in Kimura and Ohta 1969 which assumes a population size of 2N gametes and selection advantage of s/2 rather then N and s, therefore I'm changing $s=2s$ and $N=N/2$.

In [78]:

from scipy.integrate import quad as integral

In [79]:

def integrand1(s, N, x):
    if x == 1:
        return 0
    return (1 - exp(-2*N*s*x) - exp(-2*N*s*(1-x)) + exp(-2*N*s)) / (x*(1-x))

def integrand2(s,N,x):
    if x == 0:
        return 0
    return (exp(2*N*s*x) - 1) * (1 - exp(-2*N*s*x))/(x*(1-x))

def T_kimura(s,N,x):
    J1 = 1/(s*(1 - exp(-2*N*s))) * integral(lambda t: integrand1(s,N,t), x, 1)[0]
    u = (1-exp(-2*N*s*x))/(1-exp(-2*N*s))
    J2 = 1/(s*(1-exp(-2*N*s))) * integral(lambda t: integrand2(s,N,t), 0, x)[0]
    return J1 + ((1-u)/u) * J2
T_kimura(0.1, 1e6, 1e-6)

Out[79]:

254.66622091258296

Determinisic model¶

A different approach, attributed to Haldane 1924 (via Hartfiled & Otto 2011) is derived like this: the time-series of the frequency of the beneficial allele can be approximated by: $$ f(t) = \frac{f(0)}{f(0) + (1-f(0))(1-s)^t}. $$ Inversing this equation allows us to find the fixation time $T_{fix}$: $$ (1-s)^{T_{fix}} = \frac{(1-f(T_{fix}))f(T_0)}{f(T_{fix})(1-f(T_0))} \Rightarrow \\\\ T_{fix} = \frac{\log{ \frac{(1-f(T_{fix}))f(T_0)}{f(T_{fix})(1-f(T_0))} }}{\log{1-s}}. $$ Taking $f(0)=1/N, f(T_{fix})=\frac{N-1}{N}, \log{N-1} \approx \log{N}, \log{1-s} \approx -s$ we get: $$ T_{fix} = 2 \frac{\log{N}}{s} $$

Simulation¶

In [81]:

def T_simulation(s,N,x,replicates=100):
    if pool:       
        results = pool.map_async(simulate_fixation, range(replicates), s=s, N=N, x=x, plot=False)
    else:
        results = [simulate_fixation(s, N, x, False) for _ in range(replicates)]
    return mean([t[1] for t in results if t[0]])
T_simulation(0.1, 1e5, 1e-5)

Out[81]:

198.75

Comparison¶

In [86]:

N_range = map(int,logspace(3,8,50))
s = 0.01
T_kimura_N    = [T_kimura(s, N, 1./N) for N in N_range]
T_haldane_N   = [-2*log(N)/log(1-s) for N in N_range]

In [ ]:

%%cache T_simulation_N.pkl N_range_small T_simulation_N
N_range_small = map(int,logspace(3,8,5))
T_simulation_N = [T_simulation(s, N, 1./N, replicates=100) for N in N_range_small]

In [87]:

plot_kimura_eshel_simulation(N_range, T_kimura_N, T_haldane_N, xlabel='N', ylabel='T');

In [88]:

N = int(1e6)
s_range = logspace(-3,-1,50)
T_kimura_s    = [T_kimura(s, N, 1./N) for s in s_range]
T_haldane_s   = [2*log(N)/log(1+s) for s in s_range]

In [72]:

%%cache T_simulation_s.pkl s_range_small T_simulation_s
s_range_small = logspace(-3,-1,5)
T_simulation_s = [T_simulation(s, N, 1./N, replicates=100) for s in s_range_small]

In [89]:

plot_kimura_eshel_simulation(s_range, T_kimura_s, T_haldane_s, xlabel='s', ylabel='T');

In [ ]: