Problem 1¶

In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas

def get_num_false_discoveries(m, alpha):
    """Return the number of false discoveries when applying the Benjamini-Hochberg
    prescription with `alpha` to `m` uniformly distributed p-values."""
    pvals = np.random.random(m)
    pvals.sort()
    k = np.arange(1, len(pvals) + 1)
    thresholds = k * (alpha / m)
    return np.sum(pvals <= thresholds)

def estimate_probs(m=50, alpha=0.05, num_simulations=10**5):
    """Estimate the probability of getting N=0, 1, 2, and 3 false discoveries
    when applying the Benjamini-Hochberg prescription with `alpha` to `m` uniformly
    distributed p-values."""
    counts = np.zeros(4)
    for _ in xrange(num_simulations):
        num_discoveries = get_num_false_discoveries(m, alpha)
        if num_discoveries < len(counts):
            counts[num_discoveries] += 1

    probs = counts / counts.sum()
    return probs

for count, prob in enumerate(estimate_probs()):
    print 'P({0}) = {1}'.format(count, prob)

P(0) = 0.94991
P(1) = 0.04654
P(2) = 0.00327
P(3) = 0.00028

Problem 2¶

In [7]:

def collect_results(ms, alphas):
    """Collect probabilities of false discoveries for every combination of ms and alphas."""
    results = []
    for m in ms:
        for alpha in alphas:
            result = [m, alpha]
            result.extend(estimate_probs(m=m, alpha=alpha))
            results.append(result)

    return pandas.DataFrame(results, columns=['m', 'alpha', 'p0', 'p1', 'p2', 'p3'])

def plot_results(results, x_var, logx=True):
    fig, axs = plt.subplots(2, 2, figsize=(6, 6))
    for num_discoveries in xrange(4):
        row = num_discoveries // 2
        col = num_discoveries % 2
        x_axis = results[x_var]
        probs = results['p{0}'.format(num_discoveries)]
        axs[row][col].plot(x_axis, probs, marker='o')
        axs[row][col].set_xscale('log')
        axs[row][col].set_title('p{0}'.format(num_discoveries))

Collect and plot results for varying values of alpha.¶

In [3]:

vary_alpha_results = collect_results([50], [0.0001, 0.001, 0.01, 0.1, 0.5])
vary_alpha_results

Out[3]:

	m	alpha	p0	p1	p2	p3
0	50	0.0001	0.999910	0.000090	0.000000	0.000000
1	50	0.0010	0.999080	0.000920	0.000000	0.000000
2	50	0.0100	0.990490	0.009420	0.000090	0.000000
3	50	0.1000	0.898847	0.087688	0.011555	0.001911
4	50	0.5000	0.571245	0.232542	0.123220	0.072992

5 rows × 6 columns

In [8]:

plot_results(vary_alpha_results, 'alpha')

Collect and plot results for varying values of m.¶

In [5]:

vary_m_results = collect_results([5, 50, 500, 5000, 50000], [0.05])
vary_m_results

Out[5]:

	m	alpha	p0	p1	p2	p3
0	5	0.05	0.949959	0.047080	0.00289	0.00007
1	50	0.05	0.949988	0.046511	0.00323	0.00027
2	500	0.05	0.950659	0.045651	0.00330	0.00039
3	5000	0.05	0.949107	0.047523	0.00308	0.00029
4	50000	0.05	0.950880	0.045400	0.00342	0.00030

5 rows × 6 columns

In [10]:

plot_results(vary_m_results, 'm')