from __future__ import division, print_function import numpy as np np.random.seed(42) import scipy.stats import random random.seed(42) import matplotlib.pyplot as plt %matplotlib inline import sys # import anhima # # dev imports sys.path.insert(0, '..') %reload_ext autoreload %autoreload 1 %aimport anhima.sim %aimport anhima.gt %aimport anhima.af # simulate genotypes n_variants = 1000 n_samples = 1000 ploidy = 2 af_dist = scipy.stats.beta(a=.4, b=.6) p_missing = .1 genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy) anhima.af.count_variant(genotypes) anhima.af.count_non_variant(genotypes) n_seg = anhima.af.count_segregating(genotypes) n_seg n_non_seg = anhima.af.count_non_segregating(genotypes) n_non_seg assert n_seg + n_non_seg == n_variants n_non_seg_ref = anhima.af.count_non_segregating(genotypes, allele=0) n_non_seg_ref n_non_seg_alt = anhima.af.count_non_segregating(genotypes, allele=1) n_non_seg_alt assert n_non_seg == n_non_seg_ref + n_non_seg_alt anhima.af.count_singletons(genotypes) anhima.af.count_doubletons(genotypes) an, ac, af = anhima.af.allele_frequencies(genotypes) # plot actual distribution of alt allele frequencies plt.hist(af[:, 1], bins=20); # simulate genotypes n_variants = 2000 n_samples = 200 ploidy = 2 af_dist = scipy.stats.beta(a=.4, b=.6) p_missing = 0.1 genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy) # simulate two sub-populations with relatedness genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=7000, copy=False) genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=7000, copy=False) # check relatedness gn = anhima.gt.as_n_alt(genotypes) d, ds = anhima.dist.pairwise_distance(gn) anhima.dist.plot_pairwise_distance(ds); # calculate allele frequencies, leaving out 5 samples from each subpopulation _, _, af_subpop1 = anhima.af.allele_frequency(genotypes_subpop1[:, :(n_samples//2)-5]) _, _, af_subpop2 = anhima.af.allele_frequency(genotypes_subpop2[:, :(n_samples//2)-5]) # filter variants to use only those with power to differentiate between populations flt = np.abs(af_subpop1 - af_subpop2) > .5 genotypes_subpop1_flt = genotypes_subpop1[flt] genotypes_subpop2_flt = genotypes_subpop2[flt] # plot allele frequencies to visualise choice fig, ax = plt.subplots(figsize=(5, 5)) ax.plot(af_subpop1, af_subpop2, color='#cccccc', marker='o', linestyle=' ', mew=1, mfc='none'); af_subpop1_flt = af_subpop1[flt] af_subpop2_flt = af_subpop2[flt] ax.plot(af_subpop1_flt, af_subpop2_flt, 'bo', mew=0) ax.set_xlabel('allele frequency (subpop1)') ax.set_ylabel('allele frequency (subpop2)'); # plot allele frequencies in each subpopulation fig, ax = plt.subplots(figsize=(14, 3)) x = np.arange(af_subpop1_flt.size) + .5 ax.plot(x, af_subpop1_flt, 'bo-', label='subpop1') ax.plot(x, af_subpop2_flt, 'ro-', label='subpop2') ax.set_yticks([0, 1]) ax.set_xticks([]) ax.set_xlim(0, af_subpop1_flt.size) ax.set_ylabel('allele frequency') ax.legend(loc='upper left', bbox_to_anchor=(1.01, 1)); def investigate_ancestry(genotypes, af_subpop1, af_subpop2, filter_size=0, min_confidence=1): # predict ancestry ancestry, confidence = anhima.af.maximum_likelihood_ancestry(genotypes, af_subpop1, af_subpop2, filter_size=filter_size) # plot the ancestry predictions fig, ax = plt.subplots(figsize=(14, 2)) anhima.gt.plot_discrete_calldata(ancestry, colors='wbyr', states=(-1, 0, 1, 2), ax=ax) ax.set_title('ancestry') # plot the confidence fig, ax = plt.subplots(figsize=(14, 2)) anhima.gt.plot_continuous_calldata(confidence, ax=ax, pcolormesh_kwargs=dict(cmap='Greys', vmin=0, vmax=20)) ax.set_title('ancestry confidence') # plot distribution of confidence scores fig, ax = plt.subplots() ax.hist(confidence.flatten(), bins=20) ax.set_title('confidence') # plot ancestry filtered by confidence fig, ax = plt.subplots(figsize=(14, 2)) x = ancestry.copy() x[confidence < min_confidence] = -1 anhima.gt.plot_discrete_calldata(x, colors='wbyr', states=(-1, 0, 1, 2), ax=ax) ax.set_title('ancestry, filtered by confidence') # investigate ancestry in 5 samples from subpop1 we set aside earlier investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1) # combine information from neighbouring variants investigate_ancestry(genotypes_subpop1_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10) # investigate ancestry in 5 samples from subpop2 we set aside earlier investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=0, min_confidence=1) # combine information from neighbouring variants investigate_ancestry(genotypes_subpop2_flt[:, (n_samples//2)-5:], af_subpop1_flt, af_subpop2_flt, filter_size=20, min_confidence=10)