# Naive Bayes via MCMC: compare sklearn's BernoulliNB predicted probabilities
# against an equivalent PyMC (2.x) graphical model on simulated clustered
# binary data.
#
# NOTE(review): this file is an IPython notebook export.  The original used
# the `!date` and `%time` magics and relied on `%pylab` to inject `zeros`,
# `empty`, `mean`, and `plot` into the namespace.  Those dependencies are
# made explicit below so the file runs as a plain script.

import random

import numpy as np
import pymc as mc


def simulate(n, p, seed):
    """Simulate ``n`` binary feature rows of length ``p`` from A=5 clusters.

    Each cluster ``a`` has per-feature success probabilities
    ``X_true[a] ~ Beta(.5, .5)`` and an outcome probability
    ``y_true[a] ~ Beta(1, 1)``.  Rows are assigned to clusters uniformly at
    random; a quarter of the rows are held out as a test set.

    Returns the function's ``locals()`` dict, whose useful keys are
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` (plus the simulation
    internals such as ``X_true`` and ``p_true``).
    """
    random.seed(123456 + seed)
    mc.np.random.seed(123456 + seed)

    # make A clusters, beta distributed
    A = 5
    X_true = mc.rbeta(.5, .5, size=(A, p))
    y_true = mc.rbeta(1, 1, size=A)

    X = np.zeros((n, p))
    p_true = np.zeros(n)
    for i in range(n):
        a_i = random.randrange(A)
        X[i] = mc.rbernoulli(X_true[a_i])
        p_true[i] = y_true[a_i]
    y = mc.rbinomial(1, p_true)

    # Hold out a quarter of the rows for testing.
    # BUG FIX: the original `n/4` is a float under Python 3, and
    # random.sample requires an integer sample size.
    test = random.sample(range(n), n // 4)
    train = list(set(range(n)) - set(test))

    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    return locals()


n = 1000
p = 10
data = simulate(n=n, p=p, seed=0)

import sklearn.naive_bayes

nb = sklearn.naive_bayes.BernoulliNB(alpha=0, fit_prior=False)
nb.fit(data['X_train'], data['y_train'])
y_pred = nb.predict_proba(data['X_test'])

# Accuracy of the sklearn classifier on the held-out set (the notebook
# displayed this bare expression; a script must print it explicitly).
print(((y_pred[:, 1] >= .5) == data['y_test']).mean())

# Build the same naive-Bayes model as a PyMC graph: one latent Bernoulli
# outcome per test row, with the per-feature class-conditional probabilities
# estimated from the training data by empirical frequencies.
n_test = len(data['X_test'])
y = [mc.Bernoulli('y_%d' % i, .5) for i in range(n_test)]

alpha = np.empty(p)
beta = np.empty(p)
for j in range(p):
    # alpha[j] is Pr[X_j = 1 | y = 1] in training data
    alpha[j] = (data['X_train'][:, j] * data['y_train']).sum() \
        / data['y_train'].sum()
    # beta[j] is Pr[X_j = 1 | y = 0] in training data
    beta[j] = (data['X_train'][:, j] * (1 - data['y_train'])).sum() \
        / (1 - data['y_train']).sum()

# Observed feature nodes: Pr[X_ij = 1] mixes alpha/beta according to the
# latent outcome y_i.
X = [mc.Bernoulli('X_%d_%d' % (i, j),
                  alpha[j] * y[i] + beta[j] * (1 - y[i]),
                  value=data['X_test'][i, j], observed=True)
     for i in range(n_test) for j in range(p)]

mcmc = mc.MCMC([y, X])
mcmc.sample(10000)  # originally wrapped in the `%time` magic

# Posterior mean of each latent y_i against the sklearn predicted
# probability; points on the dashed diagonal indicate agreement.
import pylab as pl

pl.plot([y_i.stats()['mean'] for y_i in y], y_pred[:, 1],
        's', mec='k', color='none')
pl.plot([0, 1], [0, 1], 'k--')