import pandas as pd
import numpy as np
import patsy as pt
import statsmodels.api as sm
from statsmodels.sandbox.stats.multicomp import multipletests
Test
np.random.seed(1010093)
pValues = []
for i in xrange(1000):
x = np.random.normal(size=20)
y = np.random.normal(size=20)
y, x = pt.dmatrices('y ~ x')
pValues.append(sm.OLS(y, x).fit().pvalues[1])
pValues = np.array(pValues)
sum(pValues < .05)
54
_, p_adjust, _, _ = multipletests(pValues, method='bonferroni')
sum(p_adjust < .05)
0
_, p_adjust, _, _ = multipletests(pValues, method='fdr_bh')
sum(p_adjust < .05)
0
np.random.seed(1010093)
pValues = []
for i in xrange(1000):
x = np.random.normal(size=20)
# first 500 beta = 0, last 500 beta = 2
if i < 500:
y = np.random.normal(size=20)
else:
y = np.random.normal(loc=2*x, size=20)
y, x = pt.dmatrices('y ~ x')
pValues.append(sm.OLS(y, x).fit().pvalues[1])
pValues = np.array(pValues)
trueStatus = np.concatenate([np.repeat('zero', 500), np.repeat('not zero', 500)])
pd.crosstab(pValues < .05, trueStatus)
col_0 | not zero | zero |
---|---|---|
row_0 | ||
False | 0 | 470 |
True | 500 | 30 |
_, p_adjust, _, _ = multipletests(pValues, method='bonferroni')
pd.crosstab(p_adjust < .05, trueStatus)
col_0 | not zero | zero |
---|---|---|
row_0 | ||
False | 29 | 500 |
True | 471 | 0 |
plot(pValues, p_adjust, 'ok')
xlim(-.05, 1.05)
ylim(-.05, 1.05)
xlabel('pValues')
ylabel('p_adjust');
_, p_adjust, _, _ = multipletests(pValues, method='fdr_bh')
pd.crosstab(p_adjust < .05, trueStatus)
col_0 | not zero | zero |
---|---|---|
row_0 | ||
False | 0 | 483 |
True | 500 | 17 |
plot(pValues, p_adjust, 'ok')
xlim(-.05, 1.05)
ylim(-.05, 1.05)
xlabel('pValues')
ylabel('p_adjust');