import numpy as np import matplotlib.pyplot as plt def data_to_counts(data): ''' Convert the data to an array of counts of the numbers of occurrences of each outcome in each row. ''' counts = [np.bincount(row, minlength=10) for row in data] return np.asarray(counts) def draw_assignments(counts, ps, qs): ''' Compute the conditional distributions of the assignemnts of each row and then draw from these distributions. ''' p_likelihoods = np.product(np.power(ps, counts), axis=1) q_likelihoods = np.product(np.power(qs, counts), axis=1) assignments_probs = p_likelihoods / (p_likelihoods + q_likelihoods) assignments = np.random.uniform(size=len(assignments_probs)) < assignments_probs return np.asarray(assignments) def draw_probabilities(counts, assignments): ''' Compute the total counts in all rows assigned to each type and draw new sets of rates for each type given these counts. ''' ps_rows = counts[np.where(assignments == True)] ps_counts = ps_rows.sum(axis=0) ps = np.random.dirichlet(ps_counts) qs_rows = counts[np.where(assignments == False)] qs_counts = qs_rows.sum(axis=0) qs = np.random.dirichlet(qs_counts) return ps, qs num_cycles = 10000 data = np.loadtxt('data/gibbs_data.txt', dtype=int) num_rows, num_outcomes = data.shape counts = data_to_counts(data) ps_history = np.empty((num_cycles, num_outcomes)) qs_history = np.empty((num_cycles, num_outcomes)) assignments_history = np.empty((num_cycles, num_rows), dtype=bool) # Draw sensible starting values. ps = np.random.dirichlet(np.ones(num_outcomes) * counts.sum() / 2) qs = np.random.dirichlet(np.ones(num_outcomes) * counts.sum() / 2) for i in xrange(num_cycles): assignments = draw_assignments(counts, ps, qs) ps, qs = draw_probabilities(counts, assignments) assignments_history[i] = assignments ps_history[i] = ps qs_history[i] = qs fig, (ps_ax, qs_ax) = plt.subplots(1, 2, figsize=(16, 8)) components_to_plot = 2 for label, ax, values in [('p', ps_ax, ps_history), ('q', qs_ax, qs_history)]: for j in range(components_to_plot): ax.hist(values[:, j], bins=100, histtype='step', label='{0}[{1}]'.format(label, j)) ax.set_title('Posterior distribution of {0}'.format(label)) ax.legend() np.mean(ps_history, axis=0) np.mean(qs_history, axis=0) np.mean(assignments_history, axis=0)