from __future__ import division, print_function
from collections import defaultdict
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
approx = defaultdict(int)
mafs = []
for fname in os.listdir('.'):
if not fname.startswith('tsi-') or not fname.endswith('.frq'):
continue
f = open(fname)
f.readline()
for cnt, l in enumerate(f):
toks = [tok for tok in l.rstrip().split(' ') if tok != '']
maf = float(toks[-2])
mafs.append(maf)
approx[maf] += 1
f.close()
print(len(mafs), type(mafs))
print(len(approx))
1222126 <type 'list'> 417
%time print(np.median(mafs))
0.2216 CPU times: user 60 ms, sys: 4 ms, total: 64 ms Wall time: 65.9 ms
fig, axs = plt.subplots(2, 2, sharex=True, figsize=(16, 9))
xs, ys = zip(*approx.items())
axs[0, 0].plot(xs, ys, '.')
xs = list(approx.keys())
xs.sort()
ys = [approx[x] for x in xs]
axs[0, 1].bar(xs, ys, 0.005)
def get_bins(my_dict, nbins):
acumulator = [0] * nbins
xmin, xmax = xs[0], xs[-1]
interval = (xmax - xmin) / nbins
bin_xs = [xmin + i * interval + interval / 2 for i in range(nbins)]
#the starting point, midpoint woudl take more interval / 2
curr_bin = 0
for x in xs:
y_cnt = approx[x]
while curr_bin + 1 != nbins and abs(x - bin_xs[curr_bin]) > abs(x - bin_xs[curr_bin + 1]):
curr_bin += 1
acumulator[curr_bin] += y_cnt
return bin_xs, acumulator, interval
bin_xs, acumulator, interval = get_bins(approx, 10)
axs[1, 0].bar(np.array(bin_xs) - interval / 2, acumulator, 0.5 / 10)
bin_xs, acumulator, interval = get_bins(approx, 20)
axs[1, 1].bar(np.array(bin_xs) - interval / 2, acumulator, 0.5 / 20)
axs[1, 1].set_xlim(0, 0.5)
fig.tight_layout()
def compute_median_from_dictionary(my_dict):
xs = list(my_dict.keys())
xs.sort()
x_cnt = [my_dict[x] for x in xs]
start = 0
end = len(xs) - 1
while start != end:
if start == end - 1 and x_cnt[start] == x_cnt[end]:
return (xs[start] + xs[end]) / 2
if x_cnt[start] > x_cnt[end]:
x_cnt[start] -= x_cnt[end]
end -= 1
elif x_cnt[start] < x_cnt[end]:
x_cnt[end] -= x_cnt[start]
start += 1
else:
start += 1
end -= 1
return xs[start]
print(compute_median_from_dictionary(approx))
0.2216
#do not put this, but discuss
for my_round in [1, 2, 3]:
my_approx = defaultdict(int)
for val, cnt in approx.items():
my_approx[round(val, my_round)] += cnt
print(len(my_approx), compute_median_from_dictionary(my_approx))
6 0.2 50 0.22 326 0.222
import random
import pandas as pd
arr = np.ndarray(shape=(5, 6), dtype=float)
samp_sizes = [1, 10, 100, 1000, 10000]
for rep in range(3):
for si, samp_size in enumerate([1, 10, 100, 1000, 10000]):
my_vals = random.sample(mafs, samp_size)
arr[si, rep] = np.median(my_vals)
arr[si, rep + 3] = max(my_vals)
df = pd.DataFrame(arr, index=samp_sizes,
columns=['Mean #1', 'Mean #2', 'Mean #3', 'Max #1', 'Max #2', 'Max #3'])
df
Mean #1 | Mean #2 | Mean #3 | Max #1 | Max #2 | Max #3 | |
---|---|---|---|---|---|---|
1 | 0.23300 | 0.3977 | 0.48860 | 0.2330 | 0.3977 | 0.4886 |
10 | 0.22160 | 0.1932 | 0.22725 | 0.3807 | 0.3693 | 0.3977 |
100 | 0.26420 | 0.1733 | 0.21590 | 0.4943 | 0.4886 | 0.4830 |
1000 | 0.22445 | 0.2159 | 0.22160 | 0.5000 | 0.5000 | 0.5000 |
10000 | 0.22160 | 0.2159 | 0.22160 | 0.5000 | 0.5000 | 0.5000 |
#bill gates effect