import numpy as np np.random.seed(42) import scipy.stats import random random.seed(42) import matplotlib.pyplot as plt %matplotlib inline import sys import itertools # import logging # logger = logging.getLogger() # logger.setLevel(logging.DEBUG) # debug = logger.debug # import anhima # dev imports sys.path.insert(0, '..') %reload_ext autoreload %autoreload 1 %aimport anhima.tree %aimport anhima.sim %aimport anhima.gt %aimport anhima.dist # simulate genotypes with no relatedness n_variants = 1000 n_samples = 100 samples = [''.join(s) for s in itertools.product('ABCDEFGHIJ', repeat=2)] ploidy = 2 af_dist = scipy.stats.beta(a=.4, b=.6) p_missing = 0 genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy) gn = anhima.gt.as_n_alt(genotypes) # simulate three sub-populations with relatedness genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=1000) genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=1000) genotypes_subpop2a = anhima.sim.simulate_relatedness(genotypes_subpop2[:, n_samples//4:], relatedness=.5, n_iter=500) genotypes_subpop2b = anhima.sim.simulate_relatedness(genotypes_subpop2[:, :n_samples//4], relatedness=.5, n_iter=500) genotypes_related = np.concatenate([genotypes_subpop1, genotypes_subpop2a, genotypes_subpop2b], axis=1) gnr = anhima.gt.as_n_alt(genotypes_related) # calculate pairwise distance d, s = anhima.dist.pairwise_distance(gn, metric='euclidean') dr, sr = anhima.dist.pairwise_distance(gnr, metric='euclidean') # build a neighbour joining tree t = anhima.tree.nj(s, labels=samples) tr = anhima.tree.nj(sr, labels=samples) anhima.tree.plot_phylo(t, plot_kwargs={'type': 'unrooted', 'tip.color': 'black', 'edge.color': 'gray', 'lab4ut': 'axial'}); # first 50 samples are in pop1, second 25 are in pop2a, third 25 are in pop 2b populations = (['pop1'] * 50) + (['pop2a']*25) + (['pop2b']*25) # choose a color for each sub-population population_colors = { 'pop1': 'red', 'pop2a': 'blue', 'pop2b': 'green', } # assign a color to each sample based on population membership sample_colors = [population_colors[p] for p in populations] # assign a color to each tree edge based on population majority edge_colors = anhima.tree.color_edges_by_group_majority(tr, labels=samples, groups=populations, colors=population_colors) # now plot the tree anhima.tree.plot_phylo(tr, plot_kwargs={'type': 'unrooted', 'tip.color': sample_colors, 'edge.color': edge_colors, 'lab4ut': 'axial'}); metrics = 'euclidean', 'cityblock' fig = plt.figure(figsize=(len(metrics)*5, 5)) for i, metric in enumerate(metrics): ax = fig.add_subplot(1, len(metrics), i+1) ax.set_title(metric) d, s = anhima.dist.pairwise_distance(gnr, metric=metric) t = anhima.tree.nj(s, labels=samples) edge_colors = anhima.tree.color_edges_by_group_majority(t, labels=samples, groups=populations, colors=population_colors) anhima.tree.plot_phylo(t, plot_kwargs={'type': 'unrooted', 'tip.color': sample_colors, 'edge.color': edge_colors, 'lab4ut': 'axial'}, width=5, height=5, units='in', res=plt.rcParams['savefig.dpi'], add_scale_bar={}, ax=ax) fig.subplots_adjust(0, 0, 1, 1, hspace=0, wspace=0) d, s = anhima.dist.pairwise_distance(gnr, metric=metric) t = anhima.tree.nj(s, labels=samples) # write to string anhima.tree.write_tree(t) # write to file anhima.tree.write_tree(t, '/tmp/tree.newick') # read from file t2 = anhima.tree.read_tree('/tmp/tree.newick') anhima.tree.plot_phylo(t2, plot_kwargs={'type': 'unrooted'}); # N.B., labels will not be preserved in original order list(t2[2])[:10]