anhima.ped
- Pedigrees¶import numpy as np
np.random.seed(1)
import scipy.stats
import itertools
import random
random.seed(1)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import anhima
# # dev imports
# sys.path.insert(0, '../src')
# %reload_ext autoreload
# %autoreload 1
# %aimport anhima.sim
# %aimport anhima.gt
# %aimport anhima.ped
# simulate parent diplotype
n_variants = 1000
p_alt = .5
parent_diplotype_truth = scipy.stats.bernoulli.rvs(p_alt, size=1000*2).reshape(1000, 2)
# introduce some missingness
p_missing = .03
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
parent_diplotype = parent_diplotype_truth.copy()
parent_diplotype[loc_missing] = (-1, -1)
parent_diplotype
array([[ 0, 1], [ 0, 0], [-1, -1], ..., [ 1, 0], [ 0, 1], [ 0, 1]])
anhima.gt.count_called(parent_diplotype)
969
anhima.gt.count_hom_ref(parent_diplotype)
218
anhima.gt.count_het(parent_diplotype)
498
anhima.gt.count_hom_alt(parent_diplotype)
253
# simulate gamete haplotypes
n_gametes = 20
gamete_haplotypes = np.empty((n_variants, n_gametes), dtype='i1')
n_crossovers = scipy.stats.poisson.rvs(.8, size=n_gametes)
p_mendel_error = .03
for i in range(n_gametes):
# randomly choose which parent to start with
parent = scipy.stats.bernoulli(.5).rvs()
h = parent_diplotype_truth[:, parent].copy()
# simulate crossovers
loc_switches = sorted(np.random.randint(0, n_variants, size=n_crossovers[i]))
for l in loc_switches:
parent = 0 if parent == 1 else 1
h[l:] = parent_diplotype_truth[l:, parent]
# simulate errors
n_me = scipy.stats.binom(p=p_mendel_error, n=n_variants).rvs()
loc_me = random.sample(range(n_variants), n_me)
h[loc_me] = scipy.stats.bernoulli.rvs(.5, size=n_me)
# simulate missingness
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
h[loc_missing] = -1
gamete_haplotypes[:, i] = h
gamete_haplotypes
array([[0, 0, 0, ..., 1, 1, 1], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [1, 1, 1, ..., 1, 1, 1], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=int8)
inh = anhima.ped.diploid_inheritance(parent_diplotype, gamete_haplotypes)
inh
array([[1, 1, 1, ..., 2, 2, 2], [3, 3, 3, ..., 3, 3, 3], [6, 6, 6, ..., 6, 6, 6], ..., [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)
inheritance_colors = ['red', # parent 1
'blue', # parent 2
'green', # parents both ref
'orange', # parents both alt
'black', # non-parental
'yellow', # parents missing
'white'] # missing
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh,
colors=inheritance_colors,
states=range(1, 8),
ax=ax);
inh_seg = inh[anhima.gt.is_het(parent_diplotype), :]
inh_seg.shape
(498, 20)
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg,
colors=inheritance_colors,
states=range(8),
ax=ax);
# simulate random variant positions
pos = np.asarray(random.sample(range(100000), inh_seg.shape[0]))
pos.sort()
pos_impute = [0] + random.sample(range(100000), 100) + [200000]
pos_impute = np.array(pos_impute)
pos_impute.sort()
pos_impute
array([ 0, 682, 1794, 3263, 3620, 4928, 4945, 5237, 6830, 6918, 7415, 7968, 8707, 9714, 12041, 13358, 13519, 13545, 13551, 14085, 14949, 16359, 17154, 18678, 19166, 19631, 20521, 20824, 21016, 22807, 23542, 24696, 27088, 30974, 31144, 31609, 31883, 32706, 34207, 34242, 35562, 35599, 35974, 36032, 36288, 36677, 36888, 40295, 42835, 43212, 43310, 45768, 47497, 48036, 50774, 51069, 51245, 53460, 53945, 54192, 54996, 55915, 56681, 57512, 58083, 60969, 61203, 61389, 62704, 62730, 64382, 64505, 66584, 67060, 67797, 68451, 69773, 71139, 72614, 76299, 78176, 83795, 84479, 84790, 85590, 85799, 86734, 87031, 87146, 87160, 88634, 89032, 90108, 90355, 91637, 94182, 94856, 96246, 96493, 98512, 98529, 200000])
inh_imputed = anhima.ped.impute_inheritance_nearest(inh_seg, pos, pos_impute)
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_imputed,
colors=inheritance_colors,
states=range(8),
ax=ax);