Vishal Goklani (twitter.com/vgoklani)
Solutions from the Kaggle Dark Worlds competition

from IPython.display import display
%load_ext retina
%load_ext autosave
%autosave 90
%pylab inline

import os, sys, re, json, itertools, operator
import pandas as pd, numpy as np, scipy as sp
import matplotlib as mpl
from matplotlib.patches import Ellipse, Circle
from joblib import Memory

%load_ext cythonmagic

data_directory = os.path.abspath(os.path.join(os.path.expanduser("~"), 'build', 'kaggle', 'DarkWorlds'))
output_directory = os.path.join(data_directory, 'output')
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

joblib_cache = Memory(cachedir=os.path.join(output_directory, 'joblib'), verbose=0)

pd.set_printoptions(notebook_repr_html=True, precision=4, max_columns=200, max_rows=50, max_colwidth=25, column_space=10)
np.set_printoptions(linewidth=200, precision=15, suppress=True)

%%cython
import numpy as np
cimport numpy as np
#from libc.math cimport sqrt
cimport cython

ctypedef np.float64_t dtype_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0,
                    np.ndarray[dtype_t, ndim=1] x1, np.ndarray[dtype_t, ndim=1] y1,
                    dtype_t p=2.0):
    # p-norm style distance between point sets (p=2 gives Euclidean)
    return ((x0 - x1)**p + (y0 - y1)**p)**(1.0 / p)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef check_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0,
                     dtype_t x1, dtype_t y1, dtype_t r):
    # boolean mask: which points lie within radius r of (x1, y1)?
    return (((x0 - x1)**2 + (y0 - y1)**2)**(0.5) <= r)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y,
                  np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2,
                  np.ndarray[dtype_t, ndim=1] x_halo, np.ndarray[dtype_t, ndim=1] y_halo):
    # all-zero halo coordinates mean "no halo" -> no signal
    if not x_halo.any() and not y_halo.any():
        return 0
    cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2((y - y_halo), (x - x_halo))
    # tangential ellipticity of each galaxy with respect to the halo
    return (-(e1*np.cos(2*phi) + e2*np.sin(2*phi)))

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_point(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y,
                        np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2,
                        dtype_t x_halo, dtype_t y_halo):
    # mean tangential ellipticity with respect to a single candidate halo position
    cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2((y - y_halo), (x - x_halo))
    return (-(e1*np.cos(2*phi) + e2*np.sin(2*phi))).mean()

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_intensity(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y,
                            np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2,
                            dtype_t x_halo, dtype_t y_halo, dtype_t p=2):
    cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2((y - y_halo), (x - x_halo))
    cdef np.ndarray[dtype_t, ndim=1] signal = -(e1*np.cos(2*phi) + e2*np.sin(2*phi))
    cdef np.ndarray[dtype_t, ndim=1] distance = ((x - x_halo)**2 + (y - y_halo)**2)**(0.5)
    # note: distance**0 == 1, so the distance weighting (and the p argument)
    # is currently disabled; this reduces to calc_signal_point
    return (signal/(distance**0)).mean()

# @todo: rewrite this abomination...
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef get_max(np.ndarray[dtype_t, ndim=2] X, dtype_t r1=15, Py_ssize_t max_no_galaxies=3, dtype_t r2=15):
    # greedily pick up to three grid maxima; a candidate is rejected if it sits
    # within r1 (resp. r2) cells of an already-chosen peak
    cdef:
        dtype_t curr_max1 = 0.0, curr_max2 = 0.0, curr_max3 = 0.0
        Py_ssize_t i, i_max1, i_max2, i_max3
        Py_ssize_t j, j_max1, j_max2, j_max3
    for i in xrange(X.shape[0]):
        for j in xrange(X.shape[1]):
            if X[i,j] > curr_max1:
                curr_max1 = X[i,j]
                i_max1 = i
                j_max1 = j
            elif max_no_galaxies > 1 and X[i,j] > curr_max2 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2):
                curr_max2 = X[i,j]
                i_max2 = i
                j_max2 = j
            elif max_no_galaxies > 2 and X[i,j] > curr_max3 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2) and (((i_max2 - i)**2 + (j_max2 - j)**2) > r2**2):
                curr_max3 = X[i,j]
                i_max3 = i
                j_max3 = j
    return [(i_max1, j_max1, curr_max1), (i_max2, j_max2, curr_max2), (i_max3, j_max3, curr_max3)][:max_no_galaxies]

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_strength_(np.ndarray[dtype_t, ndim=2] M, dtype_t r=5000):
    # for each of the 300 skies, evaluate the mean signal intensity on a
    # 42x42 grid (every 100 pixels) using only galaxies within radius r
    cdef:
        np.ndarray[dtype_t, ndim=3] s = np.zeros((300, 42, 42), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] w
        Py_ssize_t x0, y0, skyid
    for skyid in xrange(300):
        X = M[M[:,0] == skyid]
        for x0 in xrange(0, 4200, 100):
            for y0 in xrange(0, 4200, 100):
                w = X[check_distance(X[:,1], X[:,2], x0, y0, r)]
                if w.shape[0] > 0:
                    s[skyid, int(x0/100.0), int(y0/100.0)] = calc_signal_intensity(w[:,1], w[:,2], w[:,3], w[:,4], x0, y0)
    return s.reshape((300, -1))  # (# of samples, # of features)
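# Quick sanity check on the sign convention (hypothetical numbers, not taken
# from the competition data): for a galaxy on the x-axis relative to the halo,
# phi = 0 and the signal reduces to -e1, so a tangentially aligned galaxy
# (major axis along y, i.e. e1 < 0) produces a positive signal.
gx = np.array([100.0]); gy = np.array([0.0])
ge1 = np.array([-0.3]); ge2 = np.array([0.0])
print calc_signal_point(gx, gy, ge1, ge2, 0.0, 0.0)   # -> 0.3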
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython

ctypedef np.float64_t dtype_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef generate_halos(np.int number_of_halos=1):
    # draw a random halo configuration; rows are (x, y, r0, m, std)
    cdef:
        np.ndarray[dtype_t, ndim=2] halos = np.zeros((3, 5), dtype=np.float64)
    for i in xrange(number_of_halos):
        halos[i, 0] = np.random.random_sample() * 4200.0   # x
        halos[i, 1] = np.random.random_sample() * 4200.0   # y
        halos[i, 2] = np.random.random_sample() * 1000.0   # r0
        halos[i, 3] = np.random.random_sample() * 200.0    # m
        halos[i, 4] = np.random.random_sample() * 0.0      # std (currently pinned to zero)
    return halos

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef init(np.ndarray[dtype_t, ndim=2] data):
    # random-restart initialization: draw 5000 random halo configurations per
    # sky and keep the one with the highest log-likelihood
    cdef:
        np.int trial, no_halos
        np.ndarray[dtype_t, ndim=3] best_halos = np.zeros((300, 3, 5), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] log_likelihoods = np.zeros((300, 2), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] X, halos
        dtype_t log_likelihood
        Py_ssize_t skyid
    for skyid in xrange(int(data[:,0].min()), int(data[:,0].max()) + 1):
        X = data[data[:,0] == skyid]   # fancy indexing -> calls copy constructor :(
        log_likelihoods[skyid, 0] = -np.float('inf')   # initialize log_likelihood to negative infinity
        log_likelihoods[skyid, 1] = 0                  # initialize # of acceptances
        no_halos = np.int(X[0, 5])     # halo count is constant within a sky
        for trial in xrange(5000):
            halos = generate_halos(no_halos)
            log_likelihood = calc_log_likelihood(X, halos)
            if log_likelihood > log_likelihoods[skyid, 0]:
                log_likelihoods[skyid, 1] += 1
                best_halos[skyid] = halos
                log_likelihoods[skyid, 0] = log_likelihood
        #print 'finished initializing sky#%s -> %s accepted with loglikelihood=%s' % (str(skyid+1), str(log_likelihoods[skyid, 1]), str(log_likelihoods[skyid, 0]))
    return best_halos, log_likelihoods

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_log_likelihood(np.ndarray[dtype_t, ndim=2] X, np.ndarray[dtype_t, ndim=2] halos):
    cdef:
        dtype_t halo_predicted_x, halo_predicted_y, halo_predicted_r_cutoff, halo_predicted_mass, halo_predicted_std, e_std
        np.ndarray[dtype_t, ndim=1] predicted_var, e1_force, e2_force, phi, dist_halo1, r_effective, force
        Py_ssize_t i, j, k, N
        np.int number_halos = np.int(X[0, 5])
    e_std = 0.22
    e1_force = np.zeros_like(X[:,1])
    e2_force = np.zeros_like(X[:,1])
    # model parameter constraints -> shamelessly stolen from Iain Murray
    if (np.any(halos[:,0] < 0) or np.any(halos[:,0] > 4200) or
        np.any(halos[:,1] < 0) or np.any(halos[:,1] > 4200) or
        np.any(halos[:,2] < 0) or np.any(halos[:,2] > 1000) or
        np.any(halos[:,3] < 0) or np.any(halos[:,3] > 300) or
        np.any(halos[:,4] < 0) or np.any(halos[:,4] > 0.3)):
        return -np.float('inf')
    for i in xrange(number_halos):
        halo = halos[i,:]
        halo_predicted_x = halo[0]
        halo_predicted_y = halo[1]
        halo_predicted_r_cutoff = halo[2]
        halo_predicted_mass = halo[3]
        halo_predicted_std = e_std + halo[4]
        # note: predicted_var is reset for each halo, so only the last halo's
        # scatter term survives
        predicted_var = (e_std * e_std) * np.ones_like(X[:,1])
        # arctan (rather than arctan2) is fine here: phi only enters through
        # cos(2*phi) and sin(2*phi), which are invariant under phi -> phi + pi
        phi = np.arctan((X[:,2] - halo_predicted_y) / (X[:,1] - halo_predicted_x))
        dist_halo1 = ((X[:,1] - halo_predicted_x)**2 + (X[:,2] - halo_predicted_y)**2)**(0.5)
        r_effective = np.maximum(halo_predicted_r_cutoff, dist_halo1)
        force = halo_predicted_mass / (r_effective * 1.0)
        e1_force += -force * np.cos(2*phi)
        e2_force += -force * np.sin(2*phi)
        # galaxies inside the cutoff radius get the (larger) per-halo variance
        predicted_var[dist_halo1 < halo_predicted_r_cutoff] = np.maximum(
            predicted_var[dist_halo1 < halo_predicted_r_cutoff],
            halo_predicted_std * halo_predicted_std)
    log_likelihood = np.sum(-0.5 * (((X[:,3] - e1_force)**2 + (X[:,4] - e2_force)**2) / predicted_var)
                            - np.log(2 * np.pi * predicted_var))
    return log_likelihood

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef mcmc(np.ndarray[dtype_t, ndim=2] data, np.ndarray[dtype_t, ndim=3] best_halos,
           np.ndarray[dtype_t, ndim=2] log_likelihoods, np.int no_iterations, dtype_t stepsize):
    cdef:
        np.ndarray[dtype_t, ndim=2] proposed_halos = np.empty_like(best_halos[0])
        np.ndarray[dtype_t, ndim=4] samples = np.zeros((300, no_iterations, 3, 5), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] X
        np.ndarray[dtype_t, ndim=2] perturbation = np.zeros((3, 5), dtype=np.float64)
        dtype_t log_likelihood_proposed
        np.int number_halos, iteration
        Py_ssize_t skyid
    for skyid in xrange(int(data[:,0].min()), int(data[:,0].max()) + 1):
        X = data[data[:,0] == skyid]
        number_halos = np.int(X[0, 5])
        log_likelihoods[skyid, 1] = 0   # reset the acceptance counter
        for iteration in xrange(no_iterations):
            # @todo: stepsize should be different for each variable type!
            perturbation[:number_halos] = stepsize * np.random.randn(number_halos, 5)
            proposed_halos = perturbation + best_halos[skyid]
            log_likelihood_proposed = calc_log_likelihood(X, proposed_halos)
            # Metropolis accept/reject: accept with probability min(1, exp(delta))
            if np.log(np.random.random_sample()) < (log_likelihood_proposed - log_likelihoods[skyid, 0]):
                log_likelihoods[skyid, 1] += 1
                log_likelihoods[skyid, 0] = log_likelihood_proposed
                best_halos[skyid] = proposed_halos
            # record the current chain state (not the raw proposal), so rejected
            # iterations repeat the previous sample as a proper Metropolis chain should
            samples[skyid][iteration] = best_halos[skyid]
        #print 'finished processing skyid%s' % str(skyid+1)
        #print '\tacceptance ratio = %s / %s\tlog_likelihood = %s' % (str(log_likelihoods[skyid, 1]), str(no_iterations), str(log_likelihoods[skyid, 0]))
    return best_halos, samples
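# mcmc above is a plain random-walk Metropolis sampler: perturb the current
# halo parameters with Gaussian noise and accept whenever
# log(u) < loglik(proposal) - loglik(current), with u ~ Uniform(0, 1).
# A minimal 1-D sketch of the same accept rule, targeting a standard normal
# (illustration only, not the competition model):
def metropolis_1d(logp, x0, n, step):
    xs = np.empty(n)
    x, lp = x0, logp(x0)
    for i in xrange(n):
        proposal = x + step * np.random.randn()
        lp_proposal = logp(proposal)
        if np.log(np.random.random_sample()) < lp_proposal - lp:
            x, lp = proposal, lp_proposal   # accept the proposal
        xs[i] = x                           # rejected steps repeat the state
    return xs

chain = metropolis_1d(lambda z: -0.5 * z * z, 0.0, 20000, 1.0)
print chain[5000:].mean(), chain[5000:].std()   # ~0 and ~1 after burn-in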
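# get_max (used by draw_heatmap below) greedily picks up to three grid maxima,
# rejecting any candidate within r1 (or r2) cells of an already-chosen peak.
# A toy 42x42 grid with hypothetical values shows the exclusion logic:
toy = np.zeros((42, 42), dtype=np.float64)
toy[10, 10] = 5.0   # global maximum
toy[11, 11] = 4.0   # second-highest, but within r1=15 cells of the first peak
toy[35, 35] = 3.0   # far enough away to be accepted as the second peak
print get_max(toy, max_no_galaxies=2)   # -> [(10, 10, 5.0), (35, 35, 3.0)]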
@joblib_cache.cache
def load_data():
    train_halos = pd.read_csv(os.path.join(data_directory, 'Training_halos.csv'))
    test_halo_counts = pd.read_csv(os.path.join(data_directory, 'Test_haloCounts.csv'))
    train_skies = pd.DataFrame()
    for s in os.listdir(os.path.join(data_directory, 'Train_Skies')):
        try:
            d = pd.read_csv(os.path.join(data_directory, 'Train_Skies', s))
            d['SkyId'] = s.split("_")[1].split(".")[0]
            d['ID'] = d['SkyId'].map(lambda x: int(str(x).split('y')[1])) - 1
            # ellipse parameters for plotting, with total ellipticity e = sqrt(e1^2 + e2^2)
            d['a'] = 1/(1 - np.sqrt(d['e1']**2 + d['e2']**2))
            d['b'] = 1/(1 + np.sqrt(d['e1']**2 + d['e2']**2))
            d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1']) * 0.5)
            d['e1_norm'] = (d.e1 - d.e1.mean()) / d.e1.std()
            d['e2_norm'] = (d.e2 - d.e2.mean()) / d.e2.std()
            train_skies = train_skies.append(d)
        except Exception as e:
            print s, ' -> ', e.message
    test_skies = pd.DataFrame()
    for s in os.listdir(os.path.join(data_directory, 'Test_Skies')):
        try:
            d = pd.read_csv(os.path.join(data_directory, 'Test_Skies', s))
            d['SkyId'] = s.split("_")[1].split(".")[0]
            d['ID'] = d['SkyId'].map(lambda x: int(str(x).split('y')[1])) - 1
            d['a'] = 1/(1 - np.sqrt(d['e1']**2 + d['e2']**2))
            d['b'] = 1/(1 + np.sqrt(d['e1']**2 + d['e2']**2))
            d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1']) * 0.5)
            d['e1_norm'] = (d.e1 - d.e1.mean()) / d.e1.std()
            d['e2_norm'] = (d.e2 - d.e2.mean()) / d.e2.std()
            test_skies = test_skies.append(d)
        except Exception as e:
            print s, ' -> ', e.message
    df = pd.merge(train_skies, train_halos, on='SkyId')
    df_test = pd.merge(test_skies, test_halo_counts, on='SkyId')
    return df, df_test, train_halos

@joblib_cache.cache
def calc_strength(X, r):
    return calc_strength_(X, r)

def draw_sky(X, train_halos, calculated_halos=None, size_multiplier=25, filename=None):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, aspect='equal')
    for row in xrange(X.index.min(), X.index.max()):
        try:
            x, y, a, b, theta = X.ix[row][['x', 'y', 'a', 'b', 'theta']].values
            galaxy = Ellipse(xy=(x, y), width=size_multiplier*a, height=size_multiplier*b, angle=theta, fc='none')
            ax.add_patch(galaxy)
        except:
            print row
    for i in range(1, 1 + int(train_halos['numberHalos'].values[0])):
        x, y = train_halos['halo_x'+str(i)].values, train_halos['halo_y'+str(i)].values
        print x, y
        ax.add_patch(Circle(xy=(x, y), fc='r', ec='none', alpha=0.5, radius=3*size_multiplier))
    ax.autoscale_view(tight=True)
    plt.show()
    if filename is not None:
        fig.savefig(filename + '.pdf', format='pdf')  #, bbox_inches='tight')
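# draw_sky renders each galaxy as an ellipse built from the two ellipticity
# components computed in load_data: with e = sqrt(e1^2 + e2^2), a = 1/(1-e),
# b = 1/(1+e), and position angle theta = 0.5*arctan2(e2, e1).
# A worked example with hypothetical values:
e1_demo, e2_demo = 0.2, 0.1
e_demo = np.sqrt(e1_demo**2 + e2_demo**2)                    # total ellipticity ~0.2236
a_demo = 1.0 / (1.0 - e_demo)                                # semi-major axis ~1.288
b_demo = 1.0 / (1.0 + e_demo)                                # semi-minor axis ~0.817
theta_demo = np.degrees(0.5 * np.arctan2(e2_demo, e1_demo))  # position angle ~13.3 degrees
print a_demo, b_demo, theta_demo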
# the heatmap below shows the strength of the signal intensity
# due to the presence of a halo
def draw_heatmap(ID, df, train_halos, r=2000, filename=None):
    skyid = 'Sky' + str(ID + 1)
    X = df[['ID', 'x', 'y', 'e1', 'e2', 'a', 'b', 'theta']].values
    strength = calc_strength(X, r=r)
    #print strength[np.isnan(strength)].shape
    s = strength.reshape((300, 42, 42))
    fig = plt.figure(figsize=(14, 14))
    imgplot = imshow(asarray(s[ID].T), origin='lower')
    number_of_halos = train_halos[train_halos.SkyId == skyid]['numberHalos'].values[0]
    imgplot.set_cmap('spectral')
    plt.colorbar()
    colors = itertools.cycle('rgbcmykw')
    # circles: peaks picked from the signal grid
    for color, halo in zip(colors, get_max(s[ID], max_no_galaxies=number_of_halos)):
        print halo
        scatter(halo[0], halo[1], marker='o', s=500, c=color, linewidths=2)
    # crosses: true halo positions (converted from pixels to grid cells)
    th = train_halos[train_halos.SkyId == skyid]
    for i in range(1, 1 + int(th['numberHalos'].values[0])):
        x, y = th['halo_x'+str(i)].values, th['halo_y'+str(i)].values
        scatter(x/100, y/100, marker='x', s=500, c='Y', linewidths=2)
        print x[0]/100, ',', y[0]/100, ',', s[ID][int(x/100)][int(y/100)]
        #ax.add_patch(Circle(xy=(x, y), fc='y', ec='none', alpha=0.5, radius=2*size_multiplier))
        #ax.add_patch(Circle(xy=(100*halo[0], 100*halo[1]), fc=color, ec='none', alpha=0.5, radius=500))
    #ax.autoscale_view(tight=True)
    if filename is not None:
        fig.savefig(filename + '.pdf', format='pdf')  #, bbox_inches='tight')

def test_calc_log_likelihood(df_test):
    tol = 10**(-5)
    cols = ['ID', 'x', 'y', 'e1', 'e2', 'NumberHalos']
    halo1 = np.array([[4200, 4200, 1000, 200, 0]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo1) - 107.65312710886943) <= tol
    halo2 = np.array([[2400, 2400, 500, 100, 0]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo2) - 116.10013304100042) <= tol
    halo3 = np.array([[2400, 2400, 500, 100, 0.01]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo3) - 116.53971627108918) <= tol
    halo4 = np.array([[2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo4) - 116.15631624187554) <= tol
    halo5 = np.array([[2400, 2400, 500, 100, 0.001], [2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 40][cols].values, halo5) - -47.3726778566) <= tol
    halo6 = np.array([[2400, 2400, 500, 100, 0.001], [2400, 2400, 500, 100, 0.001], [2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 99][cols].values, halo6) - -218.106874888) <= tol
    halo7 = np.array([[2820.241455786800543, 2530.896772986271117, 95.463841074174709, 5.031270423604361, 0.],
                      [1239.840315636071637, 2429.487190770937559, 980.917139013906194, 52.64664806466692, 0.],
                      [108.199502973083582, 140.00882477450304, 610.810091887177578, 107.024441568824273, 0.]])
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 99][cols].values, halo7) - 91.1307046217) <= tol

@joblib_cache.cache
def run_init(X):
    #X = df_test[['ID', 'x', 'y', 'e1', 'e2', 'NumberHalos']]
    #X = df[['ID', 'x', 'y', 'e1', 'e2', 'numberHalos']]
    best_halos, log_likelihoods = init(X.values)
    return best_halos, log_likelihoods

@joblib_cache.cache
def run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019):
    predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
    return predictions, samples
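# run_init and run_mcmc are cached through joblib's Memory.cache, so re-running
# the notebook reuses on-disk results whenever the arguments hash identically.
# A minimal sketch of the mechanism (toy function, separate cache directory):
demo_cache = Memory(cachedir=os.path.join(output_directory, 'joblib_demo'), verbose=0)

@demo_cache.cache
def slow_double(x):
    print 'cache miss, computing...'
    return 2 * x

print slow_double(21)   # computes and stores the result
print slow_double(21)   # served from the on-disk cache; no 'cache miss' printed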
def test(ID, no_iterations=20000, step_size=0.019):
    X = df[df.ID == ID][['ID', 'x', 'y', 'e1', 'e2', 'numberHalos']]
    best_halos, log_likelihoods = init(X.values)
    predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
    return predictions, samples

df, df_test, train_halos = load_data()
test_calc_log_likelihood(df_test)

# "training set"
df.head()

# "test set"
df_test.head()

# x and y are both uniformly distributed, with mean ~2100 and std ~1210
# (consistent with Uniform(0, 4200)); see the distribution plots below
no_halos = 1
print df[df.numberHalos == no_halos].x.mean()
print df[df.numberHalos == no_halos].x.std()
print df[df.numberHalos == no_halos].y.mean()
print df[df.numberHalos == no_halos].y.std()

plt.figure(figsize=(7, 7))
df['x'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('x_distribution.pdf', format='pdf')

plt.figure(figsize=(7, 7))
df['y'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('y_distribution.pdf', format='pdf')

plt.figure(figsize=(10, 10))
plt.scatter(df['x'], df['y'], s=1, cmap=mpl.cm.gray)
plt.savefig('x_vs_y_distribution.pdf', format='pdf')

# e1 and e2 are both N(0, 0.22**2), independent of the number of halos;
# see the distribution plots below
no_halos = 1
print df[df.numberHalos == no_halos].e1.mean()
print df[df.numberHalos == no_halos].e1.std()
print df[df.numberHalos == no_halos].e2.mean()
print df[df.numberHalos == no_halos].e2.std()

plt.figure(figsize=(7, 7))
df['e1_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e1_norm_distribution.pdf', format='pdf')

plt.figure(figsize=(7, 7))
df['e2_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e2_norm_distribution.pdf', format='pdf')

plt.figure(figsize=(7, 7))
plt.scatter(df['e1'], df['e2'], s=1, cmap=mpl.cm.gray)
plt.savefig('e1_vs_e2_norm_distribution.pdf', format='pdf')
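# The ~0.22 scatter measured above is the e_std baseline hard-coded in
# calc_log_likelihood. As an extra (illustrative) check, D'Agostino's
# normality test on the normalized components:
from scipy import stats
print stats.normaltest(df['e1_norm'].values)   # (statistic, p-value)
print stats.normaltest(df['e2_norm'].values)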
# 1 halo case
skyid = 'Sky99'
draw_sky(df[df['SkyId'] == skyid], train_halos[train_halos['SkyId'] == skyid])

# 2 halo case - check out those galaxies!
skyid = 'Sky116'
draw_sky(df[df['SkyId'] == skyid], train_halos[train_halos['SkyId'] == skyid], filename='Sky116')

# 3 halo case
skyid = 'Sky300'
draw_sky(df[df['SkyId'] == skyid], train_halos[train_halos['SkyId'] == skyid], filename='sky300')

draw_heatmap(99, df, train_halos, r=4000, filename='heatmap_99')
draw_heatmap(115, df, train_halos, r=4000, filename='heatmap116')

X = df[['ID', 'x', 'y', 'e1', 'e2', 'numberHalos']]
best_halos, log_likelihoods = run_init(X)
predictions, samples = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019)
predictions2, samples2 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.19)
predictions3, samples3 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=1.9)

# scatter of the x/y samples for sky 1 after discarding burn-in
plt.scatter(samples[0][30000:].T[0], samples[0][30000:].T[1])

burnin = 40000
results = []
for SkyID in xrange(300):
    t = {}
    t['SkyId'] = 'Sky' + str(SkyID + 1)
    for halo in [0, 1, 2]:
        # posterior mean of each halo's x and y after burn-in
        t['halo_x'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[0][halo].mean()
        t['halo_y'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[1][halo].mean()
    results.append(t)

predictions_df = pd.DataFrame(results)
predictions_df = predictions_df[['SkyId', 'halo_x1_prediction', 'halo_y1_prediction',
                                 'halo_x2_prediction', 'halo_y2_prediction',
                                 'halo_x3_prediction', 'halo_y3_prediction']]

final_results = pd.merge(train_halos, predictions_df, on='SkyId')

final_results_1_halo = final_results[final_results.numberHalos == 1]
final_results_1_halo['x1_diff'] = final_results_1_halo['halo_x1'] - final_results_1_halo['halo_x1_prediction']
final_results_1_halo['y1_diff'] = final_results_1_halo['halo_y1'] - final_results_1_halo['halo_y1_prediction']

print final_results_1_halo['x1_diff'].mean()
print final_results_1_halo['x1_diff'].std()
final_results_1_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_1halo_error.pdf', format='pdf')

print final_results_1_halo['y1_diff'].mean()
print final_results_1_halo['y1_diff'].std()
final_results_1_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_1halo_error.pdf', format='pdf')

# we don't know the proper halo ordering, so assume each true halo is matched
# by whichever prediction gives the smaller signed difference, :p
# (note: np.minimum favors the more negative value, not the smaller error;
# see the absolute-distance sketch at the end)
final_results_2_halo = final_results[final_results.numberHalos == 2]
final_results_2_halo['x1_diff'] = np.minimum(final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x1_prediction'],
                                             final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y1_diff'] = np.minimum(final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y1_prediction'],
                                             final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo['x2_diff'] = np.minimum(final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x1_prediction'],
                                             final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y2_diff'] = np.minimum(final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y1_prediction'],
                                             final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo.head()

print final_results_2_halo['x1_diff'].mean()
print final_results_2_halo['x1_diff'].std()
final_results_2_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_2halo_error.pdf', format='pdf')

print final_results_2_halo['y1_diff'].mean()
print final_results_2_halo['y1_diff'].std()
final_results_2_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_2halo_error.pdf', format='pdf')
print final_results_2_halo['x2_diff'].mean()
print final_results_2_halo['x2_diff'].std()
final_results_2_halo['x2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x2_2halo_error.pdf', format='pdf')

print final_results_2_halo['y2_diff'].mean()
print final_results_2_halo['y2_diff'].std()
final_results_2_halo['y2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y2_2halo_error.pdf', format='pdf')

Clearly these solutions still need to be optimized: proper halo matching and per-parameter step sizes are the obvious next steps.
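# One possible cleanup, sketched (not the method used above): match each true
# halo to whichever prediction is closer in absolute distance, instead of the
# signed np.minimum, which favors the more negative difference.
def nearest_diff(truth, pred_a, pred_b):
    d_a = truth - pred_a
    d_b = truth - pred_b
    return np.where(np.abs(d_a) <= np.abs(d_b), d_a, d_b)

x1_diff = nearest_diff(final_results_2_halo['halo_x1'],
                       final_results_2_halo['halo_x1_prediction'],
                       final_results_2_halo['halo_x2_prediction'])
print np.abs(x1_diff).mean()
# a full treatment would match on 2-D distance, so x and y pick the same
# predicted halo (e.g. a Hungarian-algorithm assignment)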