from __future__ import division, print_function
import numpy as np
np.random.seed(1)
import scipy.stats
import random
random.seed(1)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import anhima
# dev imports
# sys.path.insert(0, '../src')
# %reload_ext autoreload
# %autoreload 1
# %aimport anhima.sim
# %aimport anhima.gt
# %aimport anhima.af
# simulate genotypes
n_variants = 1000
n_samples = 1000
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = .1
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
anhima.af.count_variant(genotypes)
962
anhima.af.count_non_variant(genotypes)
38
n_seg = anhima.af.count_segregating(genotypes)
n_seg
958
n_non_seg = anhima.af.count_non_segregating(genotypes)
n_non_seg
42
assert n_seg + n_non_seg == n_variants
n_non_seg_ref = anhima.af.count_non_segregating(genotypes, allele=0)
n_non_seg_ref
38
n_non_seg_alt = anhima.af.count_non_segregating(genotypes, allele=1)
n_non_seg_alt
4
assert n_non_seg == n_non_seg_ref + n_non_seg_alt
anhima.af.count_singletons(genotypes)
12
anhima.af.count_doubletons(genotypes)
9
an, ac, af = anhima.af.allele_frequencies(genotypes)
# plot actual distribution of alt allele frequencies
plt.hist(af[:, 1], bins=20);
n_samples = 100
ploidy = 2
m = n_samples * ploidy
# simulated allele counts for population 1 - constant size
pop1_derived_af = scipy.stats.reciprocal.rvs(a=.001, b=1, size=1000000)
pop1_derived_ac = np.round(pop1_derived_af * m).astype('i4')
pop1_biallelic_ac = np.column_stack([m - pop1_derived_ac, pop1_derived_ac])
# simulated allele counts for population 2 - deficit of rare variants (population bottleneck)
pop2_derived_af = scipy.stats.beta.rvs(a=.2, b=1.1, size=1000000)
pop2_derived_ac = np.round(pop2_derived_af * m).astype('i4')
pop2_biallelic_ac = np.column_stack([m - pop2_derived_ac, pop2_derived_ac])
# simulated allele counts for population 3 - excess of rare variants
pop3_derived_af = np.append(pop1_derived_af,
scipy.stats.beta.rvs(a=.1, b=6, size=1000000))
pop3_derived_ac = np.round(pop3_derived_af * m).astype('i4')
pop3_biallelic_ac = np.column_stack([m - pop3_derived_ac, pop3_derived_ac])
pop1_sfs = anhima.af.site_frequency_spectrum(pop1_derived_ac)
pop2_sfs = anhima.af.site_frequency_spectrum(pop2_derived_ac)
pop3_sfs = anhima.af.site_frequency_spectrum(pop3_derived_ac)
fig, ax = plt.subplots()
anhima.af.plot_site_frequency_spectrum(pop1_sfs, ax=ax, label='pop1', plot_kwargs=dict(lw=2))
anhima.af.plot_site_frequency_spectrum(pop2_sfs, ax=ax, label='pop2', plot_kwargs=dict(lw=2))
anhima.af.plot_site_frequency_spectrum(pop3_sfs, ax=ax, label='pop3', plot_kwargs=dict(lw=2))
ax.set_xlabel('derived allele count')
ax.legend();
pop1_sfs_scaled = anhima.af.site_frequency_spectrum_scaled(pop1_derived_ac)
pop2_sfs_scaled = anhima.af.site_frequency_spectrum_scaled(pop2_derived_ac)
pop3_sfs_scaled = anhima.af.site_frequency_spectrum_scaled(pop3_derived_ac)
fig, ax = plt.subplots()
anhima.af.plot_site_frequency_spectrum(pop1_sfs_scaled, ax=ax, label='pop1', plot_kwargs=dict(lw=2))
anhima.af.plot_site_frequency_spectrum(pop2_sfs_scaled, ax=ax, label='pop2', plot_kwargs=dict(lw=2))
anhima.af.plot_site_frequency_spectrum(pop3_sfs_scaled, ax=ax, label='pop3', plot_kwargs=dict(lw=2))
ax.set_ylim(bottom=0)
ax.set_ylabel('scaled site frequency')
ax.set_xlabel('derived allele count')
ax.legend(bbox_to_anchor=(1, 1), loc='upper left');
pop1_sfs_folded = anhima.af.site_frequency_spectrum_folded(pop1_biallelic_ac)
pop2_sfs_folded = anhima.af.site_frequency_spectrum_folded(pop2_biallelic_ac)
pop3_sfs_folded = anhima.af.site_frequency_spectrum_folded(pop3_biallelic_ac)
fig, ax = plt.subplots()
anhima.af.plot_site_frequency_spectrum(pop1_sfs_folded, ax=ax, label='pop1', plot_kwargs=dict(lw=2))
anhima.af.plot_site_frequency_spectrum(pop2_sfs_folded, ax=ax, label='pop2', plot_kwargs=dict(lw=2))
anhima.af.plot_site_frequency_spectrum(pop3_sfs_folded, ax=ax, label='pop3', plot_kwargs=dict(lw=2))
ax.set_xlabel('minor allele count')
ax.legend();
pop1_sfs_folded_scaled = anhima.af.site_frequency_spectrum_folded_scaled(pop1_biallelic_ac)
pop2_sfs_folded_scaled = anhima.af.site_frequency_spectrum_folded_scaled(pop2_biallelic_ac)
pop3_sfs_folded_scaled = anhima.af.site_frequency_spectrum_folded_scaled(pop3_biallelic_ac)
fig, ax = plt.subplots()
anhima.af.plot_site_frequency_spectrum(pop1_sfs_folded_scaled, ax=ax, label='pop1', plot_kwargs=dict(lw=2), m=m)
anhima.af.plot_site_frequency_spectrum(pop2_sfs_folded_scaled, ax=ax, label='pop2', plot_kwargs=dict(lw=2), m=m)
anhima.af.plot_site_frequency_spectrum(pop3_sfs_folded_scaled, ax=ax, label='pop3', plot_kwargs=dict(lw=2), m=m)
ax.set_ylim(bottom=0)
ax.set_ylabel('scaled site frequency')
ax.set_xlabel('minor allele frequency')
ax.legend(bbox_to_anchor=(1, 1), loc='upper left');
# simulate genotypes
n_variants = 10000
n_samples = 100
ploidy = 2
af_dist = scipy.stats.beta(a=.1, b=6)
p_missing = 0
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
# simulate three sub-populations with relatedness
pop1_genotypes = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//3], relatedness=.5, n_iter=100)
pop23_genotypes = anhima.sim.simulate_relatedness(genotypes[:, n_samples//3:], relatedness=.5, n_iter=100)
pop2_genotypes = anhima.sim.simulate_relatedness(pop23_genotypes[:, n_samples//3:], relatedness=.5, n_iter=50)
pop3_genotypes = anhima.sim.simulate_relatedness(pop23_genotypes[:, :n_samples//3], relatedness=.5, n_iter=50)
pop1_ac = anhima.af.allele_count(pop1_genotypes, allele=1)
pop2_ac = anhima.af.allele_count(pop2_genotypes, allele=1)
pop3_ac = anhima.af.allele_count(pop3_genotypes, allele=1)
subpops_ac = np.column_stack([pop1_ac, pop2_ac, pop3_ac])
dbl_counts = anhima.af.count_shared_doubletons(subpops_ac)
dbl_counts
array([[81, 18, 16], [18, 52, 36], [16, 36, 66]])
anhima.af.plot_shared_doubletons_heatmap(dbl_counts,
subpop_labels=['pop1', 'pop2', 'pop3'],
text_kwargs=dict(fontsize=12),
color_diagonal=False);
anhima.af.plot_shared_doubletons_heatmap(dbl_counts,
subpop_labels=['pop1', 'pop2', 'pop3'],
text_kwargs=dict(fontsize=12),
color_diagonal=True);
anhima.af.plot_shared_doubletons_bar(dbl_counts,
figsize_factor=2,
subpop_labels=['pop1', 'pop2', 'pop3']);