# workspace setup import numpy as np import scipy as sp import scipy.io import scipy.misc import scipy.special %matplotlib inline import matplotlib.pylab as plt import itertools as it import math def discrete_sample(pmf): coin = np.random.random() acc = 0.0 n, = pmf.shape for i in xrange(n): acc0 = acc + pmf[i] if coin <= acc0: return i acc = acc0 return n-1 def gibbs_beta_bernoulli(Y, K, alpha, beta, gamma, niters): N, D = Y.shape alpha, beta, gamma = map(float, [alpha, beta, gamma]) # start with random assignment assignments = np.random.randint(0, K, size=N) # initialize the sufficient statistics (cluster sums) accordingly sums = np.zeros((D, K), dtype=np.int64) cnts = np.zeros(K, dtype=np.int64) for yi, ci in zip(Y, assignments): sums[:,ci] += yi cnts[ci] += 1 history = np.zeros((niters, N), dtype=np.int64) # precomputations nplog = np.log npexp = np.exp nparray = np.array logsumexp = sp.misc.logsumexp lg_denom = nplog(N - 1 + alpha) alpha_over_K = alpha/K beta_plus_gamma = beta + gamma for t in xrange(niters): for i, (yi, ci) in enumerate(zip(Y, assignments)): # remove from SS sums[:,ci] -= yi cnts[ci] -= 1 lg_term1 = nplog(cnts + alpha_over_K) - lg_denom - D*nplog(beta_plus_gamma + cnts) lg_term2 = nplog(beta + sums) lg_term3 = nplog(gamma + cnts - sums) lg_dist = lg_term1 + (lg_term2*yi[:,np.newaxis] + lg_term3*(1-yi[:,np.newaxis])).sum(axis=0) lg_dist -= logsumexp(lg_dist) # normalize # reassign ci = discrete_sample(npexp(lg_dist)) assignments[i] = ci sums[:,ci] += yi cnts[ci] += 1 history[t] = assignments return history alpha, beta, gamma = 2., 1., 1. K = 2 D = 3 N = 4 pis = np.random.dirichlet(alpha/K*np.ones(K)) cis = np.array([discrete_sample(pis) for _ in xrange(N)]) aks = np.random.beta(beta, gamma, size=(K, D)) print 'Pi:', pis print 'C :', cis def bernoulli(p): return 1 if np.random.random() <= p else 0 Y = np.zeros((N, D), dtype=np.int64) for i in xrange(N): Y[i] = np.array([bernoulli(aks[cis[i], d]) for d in xrange(D)]) niters = 50000 chain = gibbs_beta_bernoulli(Y, K, alpha, beta, gamma, niters) def all_assignment_vectors(): return it.product(range(K), repeat=N) def lg_pr_joint(C, Y, K, alpha, beta, gamma): N, D = Y.shape nks = np.bincount(C, minlength=K) assert nks.shape[0] == K assert C.shape[0] == N # log P(C) gammaln = sp.special.gammaln betaln = sp.special.betaln term1 = gammaln(alpha) - gammaln(N + alpha) - K*gammaln(alpha/K) term2 = sum(gammaln(nk + alpha/K) for nk in nks) lg_pC = term1 + term2 # log P(Y|C) term1 = K*D*betaln(beta, gamma) term2 = D*sum(gammaln(nk + beta + gamma) for nk in nks) sums = np.zeros((K, D)) for yi, ci in zip(Y, C): sums[ci] += yi def fn1(nk, sum_yid): assert nk >= sum_yid return gammaln(sum_yid + beta) + gammaln(nk - sum_yid + gamma) term3 = sum(sum(fn1(nk, yid) for yid in row) for nk, row in zip(nks, sums)) lg_pYgC = -term1 - term2 + term3 return lg_pC + lg_pYgC def brute_force_posterior(Y, K, alpha, beta, gamma): N, _ = Y.shape # enumerate K^N cluster assignments lg_pis = np.array([lg_pr_joint(np.array(C), Y, K, alpha, beta, gamma) for C in all_assignment_vectors()]) lg_pis -= sp.misc.logsumexp(lg_pis) return np.exp(lg_pis) # generate an ID for each K^N element idmap = { C : i for i, C in enumerate(all_assignment_vectors()) } revidmap = { i : C for i, C in enumerate(all_assignment_vectors()) } actual_posterior = brute_force_posterior(Y, K, alpha, beta, gamma) print 'P(C=actual|Y):', actual_posterior[idmap[tuple(cis)]] print 'max_C P(C|Y):', actual_posterior.max() print 'argmax_C P(C|Y):', revidmap[actual_posterior.argmax()] smoothing = 1e-5 skip = 100 skipped_chain = chain[::skip] window = 10000 def kl(a, b): return np.sum([p*np.log(p/q) for p, q in zip(a, b)]) def histify(history, K): _, N = history.shape hist = np.zeros(K**N, dtype=np.float) for h in history: hist[idmap[tuple(h)]] += 1.0 return hist def fn(i): hist = histify(skipped_chain[max(0, i-window):(i+1)], K) + smoothing hist /= hist.sum() return kl(actual_posterior, hist) kls = map(fn, xrange(skipped_chain.shape[0])) H = histify(skipped_chain[skipped_chain.shape[0]-1-window:], K) + smoothing H /= H.sum() print "C\t\tP_g(C|Y)\tP_a(C|Y)\t|Diff|" for c, (a, b) in zip(all_assignment_vectors(), zip(H, actual_posterior)): def f(x): return '%.7f' % (x) print "\t".join(map(str, [c,f(a),f(b),f(math.fabs(a-b))])) plt.plot(np.arange(0, chain.shape[0], skip) + 1, kls) plt.xlabel('Iterations') plt.ylabel('Posterior KL-divergence') def posterior_predictive1(C, Y, K, alpha, beta, gamma): N, D = Y.shape assert C.shape[0] == N nks = np.bincount(C, minlength=K) sums = np.zeros((D, K), dtype=np.int64) for yi, ci in zip(Y, C): sums[:,ci] += yi lg_term1 = np.log(nks + alpha/K) - np.log(N + alpha) def posterior_predictive(C, Y, K, alpha, beta, gamma): N, D = Y.shape assert C.shape[0] == N nks = np.bincount(C, minlength=K) sums = np.zeros((K, D), dtype=np.int64) for yi, ci in zip(Y, C): sums[ci] += yi def fn(yvalue): def fn1(nk, sum_yid, yd): assert nk >= sum_yid theta = (beta + sum_yid) / (beta + gamma + nk) assert theta >= 0.0 and theta <= 1.0 return np.log(theta) if yd else np.log(1.-theta) def fn2(nk, row): assert len(yvalue) == row.shape[0] term1 = np.log(nk + alpha/K) - np.log(N + alpha) term2 = sum(fn1(nk, sum_yid, yd) for sum_yid, yd in zip(row, yvalue)) return term1 + term2 return sp.misc.logsumexp([fn2(nk, row) for nk, row in zip(nks, sums)]) yvalues = it.product([0, 1], repeat=D) lg_pr_yvalue = map(fn, yvalues) return np.exp(lg_pr_yvalue) def posterior_predictive_nonbayes(pis, aks): K, = pis.shape assert aks.shape[0] == K _, D = aks.shape def fn(yvalue): def fn2(yd, theta_kd): return np.log(theta_kd) if yd else np.log(1.-theta_kd) def fn1(pi_k, theta_k): return np.log(pi_k) + sum(fn2(yd, theta_kd) for yd, theta_kd in zip(yvalue, theta_k)) return sp.misc.logsumexp([fn1(pi_k, theta_k) for pi_k, theta_k in zip(pis, aks)]) yvalues = it.product([0, 1], repeat=D) lg_pr_yvalue = map(fn, yvalues) return np.exp(lg_pr_yvalue) # reference distributions actual_posterior_predictive = posterior_predictive(cis, Y, K, alpha, beta, gamma) nonbayes_posterior_predictive = posterior_predictive_nonbayes(pis, aks) posterior_predictives = np.array([ posterior_predictive(assignment, Y, K, alpha, beta, gamma) for assignment in skipped_chain]) def fn1(i): posteriors = posterior_predictives[min(0, i-window):(i+1)].mean(axis=0) return kl(actual_posterior_predictive, posteriors) posterior_predictive_kls = map(fn1, xrange(posterior_predictives.shape[0])) PH = posterior_predictives[posterior_predictives.shape[0]-1-window:].mean(axis=0) print "y\t\tP(y|C) [gibbs]\tP(y|C) [actual]\tP(y|params)\t|gibbs-actual|" for y, ((a, b), c) in zip(it.product([0,1],repeat=D), zip(zip(PH, actual_posterior_predictive), nonbayes_posterior_predictive)): print "\t".join(map(str, [y, a, b, c, math.fabs(a-b)])) plt.plot(np.arange(0, niters, skip) + 1, posterior_predictive_kls) plt.xlabel('Iterations') plt.ylabel('Posterior predictive KL-divergence') usps_digits = sp.io.loadmat('data/usps_resampled/usps_resampled.mat') usps_class_2 = usps_digits['train_labels'][2,:] == 1 usps_Y0 = usps_digits['train_patterns'][:,usps_class_2] usps_Y = np.zeros(usps_Y0.shape) usps_Y[usps_Y0>0.1] = 1.0 # make the image binary plt.imshow(usps_Y[:,3].reshape((16,16)), cmap=plt.cm.binary) usps_Y = np.array(usps_Y.T, dtype=np.int64) # convert dataset to our dimension # USPS priors usps_alpha, usps_beta, usps_gamma = 50., .5, .5 usps_K = 40 usps_N, usps_D = usps_Y.shape usps_niters = 1000 print usps_N, usps_D usps_chain = gibbs_beta_bernoulli(usps_Y, usps_K, usps_alpha, usps_beta, usps_gamma, usps_niters) def posterior_thetas(C, Y, K, beta, gamma): N, D = Y.shape assert C.shape[0] == N thetas = np.zeros((K, D)) for k in xrange(K): thetas[k] = (beta + Y[C==k].sum(axis=0)) / (beta + gamma + N) return thetas usps_chain = usps_chain[::skip] pts = np.array([posterior_thetas(c, usps_Y, usps_K, usps_beta, usps_gamma) for c in usps_chain]) pts = pts.mean(axis=0) digits_per_row = 5 assert not (pt.shape[0] % digits_per_row) img = np.vstack([ np.hstack([ p.reshape((16, 16)) for p in pt[i:(i+digits_per_row)]]) for i in xrange(0, pt.shape[0], digits_per_row)]) plt.rcParams['figure.figsize'] = (10.0, 8.0) plt.imshow(img, cmap=plt.cm.binary) plt.hist(usps_chain[-1], bins=40)