from scipy import __version__ as ver
print "Scipy", ver
from numpy import __version__ as ver
print "Numpy", ver
from matplotlib import __version__ as ver
print "Matplotlib", ver
from IPython import __version__ as ver
print "IPython", ver
from pandas import __version__ as ver
print "pandas", ver
from datetime import datetime
import os
rcParams['figure.figsize'] = (8,6)
rcParams['font.size'] = 14
rcParams['lines.linewidth'] = 2
pool = None
%load_ext ipycache
Scipy 0.12.0 Numpy 1.7.1 Matplotlib 1.3.0 IPython 2.0.0-dev pandas 0.12.0
dt = datetime.now()
print dt.ctime()
Wed Sep 3 16:41:14 2014
def simulate_fixation(s, N, x, PLOT=False):
N = int(N)
if PLOT:
_ = []
t = 0
while not (allclose(x,1) or allclose(x,0)):
assert 0 < x < 1
t += 1
if PLOT: _.append(x)
x = x/(1-s+s*x)
n = binomial(N,x)
x = n/float(N)
if PLOT:
if len(_)%100 == 0:
print len(_),
if PLOT:
plot(range(len(_)), _)
ylim(-0.1,1)
xlim(0,max(10,len(_)))
xlabel("Time")
ylabel("Frequency")
return allclose(x,1), t
while not simulate_fixation(0.1, 1e3, 1e-3, True)[0]:
pass
100
def p_simulation(s, N, x, replicates=100):
'''if os.path.exists(simulations_filename):
print "File found.",
df = pd.read_csv(simulations_filename)
else:
df = pd.DataFrame(columns=['type','s','N','x','fixation_success','fixation_time'])
df_slice = df[(df['type']=='simple_wf') & (df['s']==s) & (df['N']==N) & (df['x']==x)]
print "%d results found." % len(df_slice),
if len(df_slice) < replicates:
print "Running %d more simulations." % (replicates - len(df_slice)),'''
if pool:
results = pool.map_async(simulate_fixation, range(replicates), s=s, N=N, x=x, plot=False)
else:
results = [simulate_fixation(s, N, x, False) for _ in range(replicates)]# - len(df))]
d = [{'s':s,'N':N,'x':x,'type':'simple_wf','fixation_success':fixation_success,'fixation_time':fixation_time} for (fixation_success,fixation_time) in results]
d = pd.DataFrame(d)
#if len(df):
# print "Concatinating.",
# df = pd.concat((df, d), ignore_index=True)
#else:
df = d
#print "Writing new results to %s." % simulations_filename
#df.to_csv(simulations_filename, index=False)
df_slice = df#[(df['type']=='simple_wf') & (df['s']==s) & (df['N']==N) & (df['x']==x)]
return df_slice['fixation_success'].mean(), df_slice['fixation_success'].std(ddof=1)/sqrt(len(df_slice))
p_simulation(0.1, 1e5, 1e-5, 101)
(0.19801980198019803, 0.039850716430689319)
The probability that an indiviudal with a beneficial mutation that has fitness $1+s$ can be approximated by $p\approx 2s$. This approximation is good for a small $s$ and a large population size $N$. The approximation can be dated back to Haldane[@?] and Fisher[@?]. An accurate derivation, including many details on when the approximation is suitable, was given by Eshel[@198?].
def p_eshel(s,N,x):
return (2*s)**(N*x)
The above approximation fails for a small population, for which Kimura[@19?] provided a more accurate approximation, $p=\frac{1-e^{-2Nsx}}{1-e^{-2Ns}}$ where $x$ is the initial frequency of the beneficial mutant. For a population starting with a single beneficial mutant, this simplifies to $\frac{1-e^{-2s}}{1-e^{-2Ns}}$, for a large population ($N\to \infty$) to $1-e^{-2s}$, and for a small $s$, to $2s$.
def p_kimura(s,N,x):
return (1-exp(-2*N*s*x))/(1-exp(-2*N*s))
def plot_kimura_eshel_simulation(x, kimura=None, eshel=None, simulation=None, x_simulation=None, xlabel='', ylabel='p'):
fig,ax = subplots(1,1)
if kimura != None: ax.plot(x, kimura, label='Diffusion')
if eshel != None: ax.plot(x, eshel, label='1st Order')
if x_simulation == None: x_simulation = x
if simulation != None: ax.plot(x_simulation, simulation, 'og', alpha=0.5, label='Simulation')
ax.legend(loc=9)
ax.set_xscale('log')
ax.grid(False,'minor')
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.set_xlim(min(x)/2,max(x)*2)
low,high = ax.get_ylim()
ax.set_ylim(low-(high-low)/10, high+(high-low)/10)
return fig,ax
s = 0.001
N_range = array(map(int,logspace(3,7,50)))
p_eshel_N = p_eshel(s, N_range, 1./N_range)
p_kimura_N = p_kimura(s, N_range, 1./N_range)
%%cache p_simulation_N.pkl N_range_small p_simulation_N
N_range_small = array(map(int,logspace(3,7,5)))
p_simulation_N = [p_simulation(s, N, 1./N, replicates=100) for N in N_range_small]
plot_kimura_eshel_simulation(N_range, p_kimura_N, p_eshel_N, xlabel='N')#, p_simulation_N, 'N');
N = int(1e6)
s_range = logspace(-4,-1,50)
p_eshel_s = p_eshel(s_range, N, 1./N)
p_kimura_s = p_kimura(s_range, N, 1./N)
%%cache p_simulation_s.pkl s_range_small p_simulation_s
s_range_small = logspace(-4,-1,5)
p_simulation_s = [p_simulation(s, N, 1./N, replicates=100) for s in s_range_small]
Calculating... Saving results to p_simulation_s_N_1e6.gz
plot_kimura_eshel_simulation(s_range, p_kimura_s, p_eshel_s, xlabel='s')# p_simulation_s, 's');
The following formula is taken from eq. (17) in Kimura and Ohta 1969 which assumes a population size of 2N gametes and selection advantage of s/2 rather then N and s, therefore I'm changing $s=2s$ and $N=N/2$.
from scipy.integrate import quad as integral
def integrand1(s, N, x):
if x == 1:
return 0
return (1 - exp(-2*N*s*x) - exp(-2*N*s*(1-x)) + exp(-2*N*s)) / (x*(1-x))
def integrand2(s,N,x):
if x == 0:
return 0
return (exp(2*N*s*x) - 1) * (1 - exp(-2*N*s*x))/(x*(1-x))
def T_kimura(s,N,x):
J1 = 1/(s*(1 - exp(-2*N*s))) * integral(lambda t: integrand1(s,N,t), x, 1)[0]
u = (1-exp(-2*N*s*x))/(1-exp(-2*N*s))
J2 = 1/(s*(1-exp(-2*N*s))) * integral(lambda t: integrand2(s,N,t), 0, x)[0]
return J1 + ((1-u)/u) * J2
T_kimura(0.1, 1e6, 1e-6)
254.66622091258296
A different approach, attributed to Haldane 1924 (via Hartfiled & Otto 2011) is derived like this: the time-series of the frequency of the beneficial allele can be approximated by: $$ f(t) = \frac{f(0)}{f(0) + (1-f(0))(1-s)^t}. $$ Inversing this equation allows us to find the fixation time $T_{fix}$: $$ (1-s)^{T_{fix}} = \frac{(1-f(T_{fix}))f(T_0)}{f(T_{fix})(1-f(T_0))} \Rightarrow \\\\ T_{fix} = \frac{\log{ \frac{(1-f(T_{fix}))f(T_0)}{f(T_{fix})(1-f(T_0))} }}{\log{1-s}}. $$ Taking $f(0)=1/N, f(T_{fix})=\frac{N-1}{N}, \log{N-1} \approx \log{N}, \log{1-s} \approx -s$ we get: $$ T_{fix} = 2 \frac{\log{N}}{s} $$
def T_simulation(s,N,x,replicates=100):
if pool:
results = pool.map_async(simulate_fixation, range(replicates), s=s, N=N, x=x, plot=False)
else:
results = [simulate_fixation(s, N, x, False) for _ in range(replicates)]
return mean([t[1] for t in results if t[0]])
T_simulation(0.1, 1e5, 1e-5)
198.75
N_range = map(int,logspace(3,8,50))
s = 0.01
T_kimura_N = [T_kimura(s, N, 1./N) for N in N_range]
T_haldane_N = [-2*log(N)/log(1-s) for N in N_range]
%%cache T_simulation_N.pkl N_range_small T_simulation_N
N_range_small = map(int,logspace(3,8,5))
T_simulation_N = [T_simulation(s, N, 1./N, replicates=100) for N in N_range_small]
plot_kimura_eshel_simulation(N_range, T_kimura_N, T_haldane_N, xlabel='N', ylabel='T');
N = int(1e6)
s_range = logspace(-3,-1,50)
T_kimura_s = [T_kimura(s, N, 1./N) for s in s_range]
T_haldane_s = [2*log(N)/log(1+s) for s in s_range]
%%cache T_simulation_s.pkl s_range_small T_simulation_s
s_range_small = logspace(-3,-1,5)
T_simulation_s = [T_simulation(s, N, 1./N, replicates=100) for s in s_range_small]
plot_kimura_eshel_simulation(s_range, T_kimura_s, T_haldane_s, xlabel='s', ylabel='T');