import pandas as pd
import numpy as np
import patsy as pt
import statsmodels.api as sm
from statsmodels.sandbox.stats.multicomp import multipletests

np.random.seed(1010093)

pValues = []

for i in xrange(1000):
    x = np.random.normal(size=20)
    y = np.random.normal(size=20)
    y, x = pt.dmatrices('y ~ x')

    pValues.append(sm.OLS(y, x).fit().pvalues[1])
    
pValues = np.array(pValues)

sum(pValues < .05)

_, p_adjust, _, _ = multipletests(pValues, method='bonferroni')
sum(p_adjust < .05)

_, p_adjust, _, _ = multipletests(pValues, method='fdr_bh')
sum(p_adjust < .05)

np.random.seed(1010093)

pValues = []

for i in xrange(1000):
    x = np.random.normal(size=20)
    
    # first 500 beta = 0, last 500 beta = 2
    if i < 500:
        y = np.random.normal(size=20)
    else:
        y = np.random.normal(loc=2*x, size=20)

    y, x = pt.dmatrices('y ~ x')
    
    pValues.append(sm.OLS(y, x).fit().pvalues[1])
    
pValues = np.array(pValues)

trueStatus = np.concatenate([np.repeat('zero', 500), np.repeat('not zero', 500)])
pd.crosstab(pValues < .05, trueStatus)

_, p_adjust, _, _ = multipletests(pValues, method='bonferroni')
pd.crosstab(p_adjust < .05, trueStatus)

plot(pValues, p_adjust, 'ok')
xlim(-.05, 1.05)
ylim(-.05, 1.05)
xlabel('pValues')
ylabel('p_adjust');

_, p_adjust, _, _ = multipletests(pValues, method='fdr_bh')
pd.crosstab(p_adjust < .05, trueStatus)

plot(pValues, p_adjust, 'ok')
xlim(-.05, 1.05)
ylim(-.05, 1.05)
xlabel('pValues')
ylabel('p_adjust');