import matplotlib.pyplot as plt
import numpy as np
import pandas
def get_num_false_discoveries(m, alpha):
"""Return the number of false discoveries when applying the Benjamini-Hochberg
prescription with `alpha` to `m` uniformly distributed p-values."""
pvals = np.random.random(m)
pvals.sort()
k = np.arange(1, len(pvals) + 1)
thresholds = k * (alpha / m)
return np.sum(pvals <= thresholds)
def estimate_probs(m=50, alpha=0.05, num_simulations=10**5):
"""Estimate the probability of getting N=0, 1, 2, and 3 false discoveries
when applying the Benjamini-Hochberg prescription with `alpha` to `m` uniformly
distributed p-values."""
counts = np.zeros(4)
for _ in xrange(num_simulations):
num_discoveries = get_num_false_discoveries(m, alpha)
if num_discoveries < len(counts):
counts[num_discoveries] += 1
probs = counts / counts.sum()
return probs
for count, prob in enumerate(estimate_probs()):
print 'P({0}) = {1}'.format(count, prob)
P(0) = 0.94991 P(1) = 0.04654 P(2) = 0.00327 P(3) = 0.00028
def collect_results(ms, alphas):
"""Collect probabilities of false discoveries for every combination of ms and alphas."""
results = []
for m in ms:
for alpha in alphas:
result = [m, alpha]
result.extend(estimate_probs(m=m, alpha=alpha))
results.append(result)
return pandas.DataFrame(results, columns=['m', 'alpha', 'p0', 'p1', 'p2', 'p3'])
def plot_results(results, x_var, logx=True):
fig, axs = plt.subplots(2, 2, figsize=(6, 6))
for num_discoveries in xrange(4):
row = num_discoveries // 2
col = num_discoveries % 2
x_axis = results[x_var]
probs = results['p{0}'.format(num_discoveries)]
axs[row][col].plot(x_axis, probs, marker='o')
axs[row][col].set_xscale('log')
axs[row][col].set_title('p{0}'.format(num_discoveries))
vary_alpha_results = collect_results([50], [0.0001, 0.001, 0.01, 0.1, 0.5])
vary_alpha_results
m | alpha | p0 | p1 | p2 | p3 | |
---|---|---|---|---|---|---|
0 | 50 | 0.0001 | 0.999910 | 0.000090 | 0.000000 | 0.000000 |
1 | 50 | 0.0010 | 0.999080 | 0.000920 | 0.000000 | 0.000000 |
2 | 50 | 0.0100 | 0.990490 | 0.009420 | 0.000090 | 0.000000 |
3 | 50 | 0.1000 | 0.898847 | 0.087688 | 0.011555 | 0.001911 |
4 | 50 | 0.5000 | 0.571245 | 0.232542 | 0.123220 | 0.072992 |
5 rows × 6 columns
plot_results(vary_alpha_results, 'alpha')
vary_m_results = collect_results([5, 50, 500, 5000, 50000], [0.05])
vary_m_results
m | alpha | p0 | p1 | p2 | p3 | |
---|---|---|---|---|---|---|
0 | 5 | 0.05 | 0.949959 | 0.047080 | 0.00289 | 0.00007 |
1 | 50 | 0.05 | 0.949988 | 0.046511 | 0.00323 | 0.00027 |
2 | 500 | 0.05 | 0.950659 | 0.045651 | 0.00330 | 0.00039 |
3 | 5000 | 0.05 | 0.949107 | 0.047523 | 0.00308 | 0.00029 |
4 | 50000 | 0.05 | 0.950880 | 0.045400 | 0.00342 | 0.00030 |
5 rows × 6 columns
plot_results(vary_m_results, 'm')