import numpy as np
np.random.seed(1)
import random
random.seed(1)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
import anhima
# dev imports
# sys.path.insert(0, '../src')
# %load_ext autoreload
# %autoreload 1
# %aimport anhima.sim
# %aimport anhima.ld
# %aimport anhima.loc
Simulate genotypes with some degree of linkage disequilibrium.
n_variants = 1000
n_samples = 100
correlation = .97
g = anhima.sim.simulate_genotypes_with_ld(n_variants, n_samples, correlation)
g
array([[1, 0, 0, ..., 1, 2, 2], [1, 2, 2, ..., 1, 0, 0], [1, 2, 2, ..., 1, 0, 0], ..., [2, 0, 0, ..., 2, 1, 2], [2, 0, 0, ..., 2, 1, 0], [0, 2, 2, ..., 0, 1, 2]], dtype=int8)
Simulate the genomic physical positions of the variants.
pos = np.random.randint(low=0, high=n_variants*100, size=n_variants)
pos.sort()
Calculate an approximate value of LD between all pairs of variants.
r_squared = anhima.ld.pairwise_genotype_ld(g)
r_squared
array([[ 1. , 0.86213219, 0.84518672, ..., 0.00151991, 0.0024691 , 0.00424539], [ 0.86213219, 1. , 0.98426364, ..., 0.00124198, 0.0023271 , 0.00409844], [ 0.84518672, 0.98426364, 1. , ..., 0.00134478, 0.00239841, 0.00419007], ..., [ 0.00151991, 0.00124198, 0.00134478, ..., 1. , 0.86652792, 0.85228717], [ 0.0024691 , 0.0023271 , 0.00239841, ..., 0.86652792, 1. , 0.9845216 ], [ 0.00424539, 0.00409844, 0.00419007, ..., 0.85228717, 0.9845216 , 1. ]])
anhima.ld.plot_pairwise_ld(r_squared);
# zoom in to first 100 variants
anhima.ld.plot_pairwise_ld(r_squared[:100, :100]);
cor, sep, dist = anhima.ld.pairwise_ld_decay(r_squared, pos)
cor, sep, dist
(array([ 0.86213219, 0.84518672, 0.77356764, ..., 0.86652792, 0.85228717, 0.9845216 ]), array([1, 2, 3, ..., 1, 2, 1]), array([ 86, 213, 221, ..., 123, 129, 6]))
plt.hist(cor[sep == 1]);
plt.hist(cor[(dist > 0) & (dist < 200)]);
np.sqrt(np.mean(cor[sep == 1]))
0.96976577679882858
anhima.ld.plot_ld_decay_by_separation(cor, sep);
anhima.ld.plot_ld_decay_by_distance(cor, dist, bins=np.arange(0, 10000, 200));
anhima.ld.plot_windowed_ld(g, pos, window_size=10000);
anhima.ld.plot_windowed_ld(g, pos, window_size=1000);
included = anhima.ld.ld_prune_pairwise(g)
Indices of the variants that have been retained.
np.nonzero(included)[0]
array([ 0, 25, 58, 90, 110, 137, 161, 180, 216, 250, 280, 300, 334, 365, 393, 413, 443, 459, 483, 516, 556, 578, 612, 645, 668, 697, 720, 755, 776, 816, 836, 854, 883, 914, 940, 961, 997])
g_pruned = np.compress(included, g, axis=0)
g_pruned
array([[1, 0, 0, ..., 1, 2, 2], [1, 1, 2, ..., 1, 0, 1], [1, 0, 0, ..., 1, 1, 0], ..., [1, 1, 2, ..., 2, 1, 2], [1, 0, 1, ..., 2, 1, 2], [2, 0, 0, ..., 2, 1, 2]], dtype=int8)
r_squared_pruned = anhima.ld.pairwise_genotype_ld(g_pruned)
anhima.ld.plot_pairwise_ld(r_squared_pruned);
pos_pruned = pos[included]
cor_pruned, sep_pruned, dist_pruned = anhima.ld.pairwise_ld_decay(r_squared_pruned, pos_pruned)
anhima.ld.plot_ld_decay_by_separation(cor_pruned, sep_pruned);
anhima.ld.plot_ld_decay_by_distance(cor_pruned, dist_pruned, bins=np.arange(0, 10000, 400));
n_variants_big = 100000
g_big = anhima.sim.simulate_genotypes_with_ld(n_variants_big, n_samples, correlation=.97)
pos_big = np.random.randint(low=0, high=n_variants_big*100, size=n_variants_big)
pos_big.sort()
included_big = anhima.ld.ld_prune_pairwise(g_big)
np.count_nonzero(included_big)
3680
g_big_pruned = np.compress(included_big, g_big, axis=0)
r_squared_big_pruned = anhima.ld.pairwise_genotype_ld(g_big_pruned)
anhima.ld.plot_pairwise_ld(r_squared_big_pruned[:100, :100]);
cor_big, sep_big, dist_big = anhima.ld.windowed_ld_decay(g_big, pos_big, window_size=1000, step=10)
anhima.ld.plot_ld_decay_by_separation(cor_big, sep_big, max_separation=100);
anhima.ld.plot_ld_decay_by_distance(cor_big, dist_big, bins=np.arange(0, 10000, 200));
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=100000);
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=10000);