# workspace setup
import numpy as np
import scipy as sp
import scipy.io
import scipy.misc
import scipy.special

%matplotlib inline
import matplotlib.pylab as plt

import itertools as it
import math

def discrete_sample(pmf):
    coin = np.random.random()
    acc = 0.0
    n, = pmf.shape
    for i in xrange(n):
        acc0 = acc + pmf[i]
        if coin <= acc0:
            return i
        acc = acc0
    return n-1

def gibbs_beta_bernoulli(Y, K, alpha, beta, gamma, niters):
    N, D = Y.shape
    alpha, beta, gamma = map(float, [alpha, beta, gamma])

    # start with random assignment
    assignments = np.random.randint(0, K, size=N)

    # initialize the sufficient statistics (cluster sums) accordingly
    sums = np.zeros((D, K), dtype=np.int64)
    cnts = np.zeros(K, dtype=np.int64)
    for yi, ci in zip(Y, assignments):
        sums[:,ci] += yi
        cnts[ci] += 1

    history = np.zeros((niters, N), dtype=np.int64)

    # precomputations
    nplog = np.log
    npexp = np.exp
    nparray = np.array
    logsumexp = sp.misc.logsumexp
    lg_denom = nplog(N - 1 + alpha)
    alpha_over_K = alpha/K
    beta_plus_gamma = beta + gamma

    for t in xrange(niters):
        for i, (yi, ci) in enumerate(zip(Y, assignments)):
            # remove from SS
            sums[:,ci] -= yi
            cnts[ci] -= 1

            lg_term1 = nplog(cnts + alpha_over_K) - lg_denom - D*nplog(beta_plus_gamma + cnts)
            lg_term2 = nplog(beta + sums)
            lg_term3 = nplog(gamma + cnts - sums)

            lg_dist = lg_term1 + (lg_term2*yi[:,np.newaxis] + lg_term3*(1-yi[:,np.newaxis])).sum(axis=0)
            lg_dist -= logsumexp(lg_dist) # normalize

            # reassign
            ci = discrete_sample(npexp(lg_dist))
            assignments[i] = ci
            sums[:,ci] += yi
            cnts[ci] += 1
        history[t] = assignments

    return history

alpha, beta, gamma = 2., 1., 1.
K = 2
D = 3
N = 4

pis = np.random.dirichlet(alpha/K*np.ones(K))
cis = np.array([discrete_sample(pis) for _ in xrange(N)])
aks = np.random.beta(beta, gamma, size=(K, D))

print 'Pi:', pis
print 'C :', cis

def bernoulli(p):
    return 1 if np.random.random() <= p else 0

Y = np.zeros((N, D), dtype=np.int64)
for i in xrange(N):
    Y[i] = np.array([bernoulli(aks[cis[i], d]) for d in xrange(D)])

niters = 50000
chain = gibbs_beta_bernoulli(Y, K, alpha, beta, gamma, niters)

def all_assignment_vectors():
    return it.product(range(K), repeat=N)

def lg_pr_joint(C, Y, K, alpha, beta, gamma):
    N, D = Y.shape
    nks = np.bincount(C, minlength=K)
    assert nks.shape[0] == K
    assert C.shape[0] == N

    # log P(C)
    gammaln = sp.special.gammaln
    betaln = sp.special.betaln
    term1 = gammaln(alpha) - gammaln(N + alpha) - K*gammaln(alpha/K)
    term2 = sum(gammaln(nk + alpha/K) for nk in nks)
    lg_pC = term1 + term2

    # log P(Y|C)
    term1 = K*D*betaln(beta, gamma)
    term2 = D*sum(gammaln(nk + beta + gamma) for nk in nks)
    sums = np.zeros((K, D))
    for yi, ci in zip(Y, C):
        sums[ci] += yi
    def fn1(nk, sum_yid):
        assert nk >= sum_yid
        return gammaln(sum_yid + beta) + gammaln(nk - sum_yid + gamma)
    term3 = sum(sum(fn1(nk, yid) for yid in row) for nk, row in zip(nks, sums))
    lg_pYgC = -term1 - term2 + term3
    
    return lg_pC + lg_pYgC

def brute_force_posterior(Y, K, alpha, beta, gamma):
    N, _ = Y.shape

    # enumerate K^N cluster assignments
    lg_pis = np.array([lg_pr_joint(np.array(C), Y, K, alpha, beta, gamma) for C in all_assignment_vectors()])
    lg_pis -= sp.misc.logsumexp(lg_pis)
    
    return np.exp(lg_pis)

# generate an ID for each K^N element
idmap = { C : i for i, C in enumerate(all_assignment_vectors()) }
revidmap = { i : C for i, C in enumerate(all_assignment_vectors()) }

actual_posterior = brute_force_posterior(Y, K, alpha, beta, gamma)

print 'P(C=actual|Y):', actual_posterior[idmap[tuple(cis)]]
print 'max_C P(C|Y):', actual_posterior.max()
print 'argmax_C P(C|Y):', revidmap[actual_posterior.argmax()]

smoothing = 1e-5
skip = 100
skipped_chain = chain[::skip]
window = 10000

def kl(a, b):
    return np.sum([p*np.log(p/q) for p, q in zip(a, b)])

def histify(history, K):
    _, N = history.shape
    hist = np.zeros(K**N, dtype=np.float)
    for h in history:
        hist[idmap[tuple(h)]] += 1.0
    return hist

def fn(i):
    hist = histify(skipped_chain[max(0, i-window):(i+1)], K) + smoothing
    hist /= hist.sum()
    return kl(actual_posterior, hist)

kls = map(fn, xrange(skipped_chain.shape[0]))

H = histify(skipped_chain[skipped_chain.shape[0]-1-window:], K) + smoothing
H /= H.sum()
print "C\t\tP_g(C|Y)\tP_a(C|Y)\t|Diff|"
for c, (a, b) in zip(all_assignment_vectors(), zip(H, actual_posterior)):
    def f(x): return '%.7f' % (x)
    print "\t".join(map(str, [c,f(a),f(b),f(math.fabs(a-b))]))

plt.plot(np.arange(0, chain.shape[0], skip) + 1, kls)
plt.xlabel('Iterations')
plt.ylabel('Posterior KL-divergence')

def posterior_predictive1(C, Y, K, alpha, beta, gamma):
    N, D = Y.shape
    assert C.shape[0] == N
    
    nks = np.bincount(C, minlength=K)
    sums = np.zeros((D, K), dtype=np.int64) 
    for yi, ci in zip(Y, C): 
        sums[:,ci] += yi
        
    lg_term1 = np.log(nks + alpha/K) - np.log(N + alpha)

def posterior_predictive(C, Y, K, alpha, beta, gamma):
    N, D = Y.shape
    assert C.shape[0] == N

    nks = np.bincount(C, minlength=K)
    sums = np.zeros((K, D), dtype=np.int64) 
    for yi, ci in zip(Y, C): 
        sums[ci] += yi

    def fn(yvalue):
        def fn1(nk, sum_yid, yd):
            assert nk >= sum_yid
            theta = (beta + sum_yid) / (beta + gamma + nk) 
            assert theta >= 0.0 and theta <= 1.0 
            return np.log(theta) if yd else np.log(1.-theta)
        def fn2(nk, row):
            assert len(yvalue) == row.shape[0]
            term1 = np.log(nk + alpha/K) - np.log(N + alpha)
            term2 = sum(fn1(nk, sum_yid, yd) for sum_yid, yd in zip(row, yvalue))
            return term1 + term2
        return sp.misc.logsumexp([fn2(nk, row) for nk, row in zip(nks, sums)])

    yvalues = it.product([0, 1], repeat=D)
    lg_pr_yvalue = map(fn, yvalues)
    return np.exp(lg_pr_yvalue)

def posterior_predictive_nonbayes(pis, aks):
    K, = pis.shape
    assert aks.shape[0] == K
    _, D = aks.shape
    
    def fn(yvalue):
        def fn2(yd, theta_kd):
            return np.log(theta_kd) if yd else np.log(1.-theta_kd)
        def fn1(pi_k, theta_k):
            return np.log(pi_k) + sum(fn2(yd, theta_kd) for yd, theta_kd in zip(yvalue, theta_k))
        return sp.misc.logsumexp([fn1(pi_k, theta_k) for pi_k, theta_k in zip(pis, aks)])
    
    yvalues = it.product([0, 1], repeat=D)
    lg_pr_yvalue = map(fn, yvalues)
    return np.exp(lg_pr_yvalue)

# reference distributions
actual_posterior_predictive = posterior_predictive(cis, Y, K, alpha, beta, gamma)
nonbayes_posterior_predictive = posterior_predictive_nonbayes(pis, aks)

posterior_predictives = np.array([
    posterior_predictive(assignment, Y, K, alpha, beta, gamma) for assignment in skipped_chain])
    
def fn1(i):
    posteriors = posterior_predictives[min(0, i-window):(i+1)].mean(axis=0)
    return kl(actual_posterior_predictive, posteriors)

posterior_predictive_kls = map(fn1, xrange(posterior_predictives.shape[0]))

PH = posterior_predictives[posterior_predictives.shape[0]-1-window:].mean(axis=0)
print "y\t\tP(y|C) [gibbs]\tP(y|C) [actual]\tP(y|params)\t|gibbs-actual|"
for y, ((a, b), c) in zip(it.product([0,1],repeat=D), zip(zip(PH, actual_posterior_predictive), nonbayes_posterior_predictive)):
    print "\t".join(map(str, [y, a, b, c, math.fabs(a-b)]))

plt.plot(np.arange(0, niters, skip) + 1, posterior_predictive_kls)
plt.xlabel('Iterations')
plt.ylabel('Posterior predictive KL-divergence')

usps_digits = sp.io.loadmat('data/usps_resampled/usps_resampled.mat')

usps_class_2 = usps_digits['train_labels'][2,:] == 1
usps_Y0 = usps_digits['train_patterns'][:,usps_class_2]
usps_Y = np.zeros(usps_Y0.shape)
usps_Y[usps_Y0>0.1] = 1.0 # make the image binary
plt.imshow(usps_Y[:,3].reshape((16,16)), cmap=plt.cm.binary)

usps_Y = np.array(usps_Y.T, dtype=np.int64) # convert dataset to our dimension

# USPS priors
usps_alpha, usps_beta, usps_gamma = 50., .5, .5
usps_K = 40
usps_N, usps_D = usps_Y.shape
usps_niters = 1000
print usps_N, usps_D

usps_chain = gibbs_beta_bernoulli(usps_Y, usps_K, usps_alpha, usps_beta, usps_gamma, usps_niters)

def posterior_thetas(C, Y, K, beta, gamma):
    N, D = Y.shape
    assert C.shape[0] == N
    thetas = np.zeros((K, D))
    for k in xrange(K):
        thetas[k] = (beta + Y[C==k].sum(axis=0)) / (beta + gamma + N)
    return thetas

usps_chain = usps_chain[::skip]
pts = np.array([posterior_thetas(c, usps_Y, usps_K, usps_beta, usps_gamma) for c in usps_chain])
pts = pts.mean(axis=0)

digits_per_row = 5
assert not (pt.shape[0] % digits_per_row)
img = np.vstack([
    np.hstack([
        p.reshape((16, 16)) for p in pt[i:(i+digits_per_row)]]) 
    for i in xrange(0, pt.shape[0], digits_per_row)]) 
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.imshow(img, cmap=plt.cm.binary)

plt.hist(usps_chain[-1], bins=40)