import numpy as np np.random.seed(1) import scipy.stats import itertools import random random.seed(42) import matplotlib.pyplot as plt %matplotlib inline import sys # import anhima # dev imports sys.path.insert(0, '..') %reload_ext autoreload %autoreload 1 %aimport anhima.sim %aimport anhima.util %aimport anhima.gt %aimport anhima.ped # simulate parent diplotype n_variants = 1000 p_alt = .5 parent_diplotype_truth = scipy.stats.bernoulli.rvs(p_alt, size=1000*2).reshape(1000, 2) # introduce some missingness p_missing = .03 n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs() loc_missing = random.sample(range(n_variants), n_missing) parent_diplotype = parent_diplotype_truth.copy() parent_diplotype[loc_missing] = (-1, -1) parent_diplotype anhima.gt.count_called(parent_diplotype) anhima.gt.count_hom_ref(parent_diplotype) anhima.gt.count_het(parent_diplotype) anhima.gt.count_hom_alt(parent_diplotype) # simulate gamete haplotypes n_gametes = 20 gamete_haplotypes = np.empty((n_variants, n_gametes), dtype='i1') n_crossovers = scipy.stats.poisson.rvs(.8, size=n_gametes) p_mendel_error = .03 for i in range(n_gametes): # randomly choose which parent to start with parent = scipy.stats.bernoulli(.5).rvs() h = parent_diplotype_truth[:, parent].copy() # simulate crossovers loc_switches = sorted(np.random.randint(0, n_variants, size=n_crossovers[i])) for l in loc_switches: parent = 0 if parent == 1 else 1 h[l:] = parent_diplotype_truth[l:, parent] # simulate errors n_me = scipy.stats.binom(p=p_mendel_error, n=n_variants).rvs() loc_me = random.sample(range(n_variants), n_me) h[loc_me] = scipy.stats.bernoulli.rvs(.5, size=n_me) # simulate missingness n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs() loc_missing = random.sample(range(n_variants), n_missing) h[loc_missing] = -1 gamete_haplotypes[:, i] = h gamete_haplotypes inh = anhima.ped.diploid_inheritance(parent_diplotype, gamete_haplotypes) inh inheritance_colors = ['red', # parent 1 'blue', # parent 2 'green', # parents both ref 'orange', # parents both alt 'black', # non-parental 'yellow', # parents missing 'white'] # missing fig, ax = plt.subplots(figsize=(12, 8)) anhima.gt.plot_discrete_calldata(inh, colors=inheritance_colors, states=range(1, 8), ax=ax); inh_seg = inh[anhima.gt.is_het(parent_diplotype), :] inh_seg.shape fig, ax = plt.subplots(figsize=(12, 8)) anhima.gt.plot_discrete_calldata(inh_seg, colors=inheritance_colors, states=range(8), ax=ax) ax.set_xticks(np.arange(0, inh_seg.shape[0], 50)); # simulate random variant positions pos = np.asarray(random.sample(range(100000), inh_seg.shape[0])) pos.sort() pos_impute = [0] + random.sample(range(100000), 100) + [200000] pos_impute = np.array(pos_impute) pos_impute.sort() pos_impute inh_imputed = anhima.ped.impute_inheritance_nearest(inh_seg, pos, pos_impute) fig, ax = plt.subplots(figsize=(12, 8)) anhima.gt.plot_discrete_calldata(inh_imputed, colors=inheritance_colors, states=range(8), ax=ax); # consider first gamete only x = inh_seg[:, 0] fig, ax = plt.subplots(figsize=(14, 1)) anhima.gt.plot_discrete_calldata(x[:, None], colors=inheritance_colors, states=range(8), ax=ax) ax.set_xticks(np.arange(0, x.size, 50)); states = anhima.ped.INHERIT_PARENT1, anhima.ped.INHERIT_PARENT2 df_switches = anhima.util.tabulate_state_transitions(x, states, pos) df_switches.head() df_switches.tail() df_blocks = anhima.util.tabulate_state_blocks(x, states, pos) df_blocks.head() df_blocks.tail() df_all_switches = anhima.ped.tabulate_inheritance_switches(inh_seg, pos) df_all_switches.head() df_switches.equals(df_all_switches.loc[0]) df_all_blocks = anhima.ped.tabulate_inheritance_blocks(inh_seg, pos) df_all_blocks.head() df_blocks.equals(df_all_blocks.loc[0]) block_support, block_length_min = anhima.ped.inheritance_block_masks(inh_seg, pos) # filter using block support inh_seg_flt = inh_seg.copy() inh_seg_flt[block_support < 2] = anhima.ped.INHERIT_MISSING fig, ax = plt.subplots(figsize=(12, 8)) anhima.gt.plot_discrete_calldata(inh_seg_flt, colors=inheritance_colors, states=range(8), ax=ax) ax.set_xticks(np.arange(0, inh_seg.shape[0], 50)); anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos) # filter using minimal block length inh_seg_flt = inh_seg.copy() inh_seg_flt[block_length_min < 100] = anhima.ped.INHERIT_MISSING fig, ax = plt.subplots(figsize=(12, 8)) anhima.gt.plot_discrete_calldata(inh_seg_flt, colors=inheritance_colors, states=range(8), ax=ax) ax.set_xticks(np.arange(0, inh_seg.shape[0], 50)); anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)