In [3]:

from IPython.display import display
%load_ext retina
%load_ext autosave
%autosave 90

%pylab inline

Usage: %autosave [seconds]
autosaving every 90s

Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.

In [4]:

import os, sys, re, json, itertools, operator
import pandas as pd, numpy as np, scipy as sp
from matplotlib.patches import Ellipse, Circle
from joblib import Memory

%load_ext cythonmagic

In [5]:

data_directory = os.path.abspath(os.path.join(os.path.expanduser("~"), 'build', 'kaggle', 'DarkWorlds'))
output_directory = os.path.join(data_directory, 'output')

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

joblib_cache = Memory(cachedir=os.path.join(output_directory, 'joblib'), verbose=0)

pd.set_printoptions(notebook_repr_html=True, precision=4, max_columns=200, max_rows=50, max_colwidth=25, column_space=10)
np.set_printoptions(linewidth=200, precision=15, suppress=True)

/Users/vgoklani/anaconda/lib/python2.7/site-packages/pandas-0.10.0.dev_74ab638-py2.7-macosx-10.5-x86_64.egg/pandas/core/format.py:1264: FutureWarning: set_printoptions is deprecated, use set_option instead
  FutureWarning)

In [6]:

%%cython
import numpy as np

cimport numpy as np
#from libc.math cimport sqrt
cimport cython

ctypedef np.float64_t dtype_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0, np.ndarray[dtype_t, ndim=1] x1, np.ndarray[dtype_t, ndim=1] y1, dtype_t p=2.0):
	return ( (x0-x1)**p + (y0-y1)**p )**(1.0/p)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef check_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0, dtype_t x1, dtype_t y1, dtype_t r):
	return ( ( (x0-x1)**2 + (y0-y1)**2 )**(0.5) <= r )

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y, np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2, np.ndarray[dtype_t, ndim=1] x_halo, np.ndarray[dtype_t, ndim=1] y_halo):
	if x_halo.all() == 0.0 and y_halo.all() == 0.0:
		return 0
	cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2( (y - y_halo), (x - x_halo) )
	return (-( e1*np.cos(2*phi) + e2*np.sin(2*phi) ))

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_point(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y, np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2, dtype_t x_halo, dtype_t y_halo):
	cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2( (y - y_halo), (x - x_halo) )
	return (-( e1*np.cos(2*phi) + e2*np.sin(2*phi) )).mean()

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_intensity(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y, np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2, dtype_t x_halo, dtype_t y_halo, dtype_t p=2):
	cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2( (y - y_halo), (x - x_halo) )
	cdef np.ndarray[dtype_t, ndim=1] signal = -( e1*np.cos(2*phi) + e2*np.sin(2*phi))
	cdef np.ndarray[dtype_t, ndim=1] distance = ( (x-x_halo)**2 + (y-y_halo)**2 )**(0.5)
	return (signal/(distance**0)).mean()

# @todo: rewrite this abomination...
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef get_max(np.ndarray[dtype_t, ndim=2] X, dtype_t r1=15, Py_ssize_t max_no_galaxies = 3, dtype_t r2=15):
	cdef:
		dtype_t curr_max1 = 0.0, curr_max2 = 0.0, curr_max3 = 0.0
		Py_ssize_t i, i_max1, i_max2, i_max3
		Py_ssize_t j, j_max1, j_max2, j_max3
	for i in xrange(X.shape[0]):
		for j in xrange(X.shape[1]):
			if X[i,j] > curr_max1:
				curr_max1 = X[i,j]
				i_max1 = i
				j_max1 = j
			elif max_no_galaxies > 1 and X[i,j] > curr_max2 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2):
				curr_max2 = X[i,j]
				i_max2 = i
				j_max2 = j
			elif max_no_galaxies > 2 and X[i,j] > curr_max3 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2) and (((i_max2 - i)**2 + (j_max2 - j)**2) > r2**2):
				curr_max3 = X[i,j]
				i_max3 = i
				j_max3 = j
	return [(i_max1, j_max1, curr_max1), (i_max2, j_max2, curr_max2), (i_max3, j_max3, curr_max3)][:max_no_galaxies]

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_strength_(np.ndarray[dtype_t, ndim=2] M, dtype_t r=5000):
	cdef:
		np.ndarray[dtype_t, ndim=3] s = np.zeros((300, 42, 42), dtype=np.float64)
		np.ndarray[dtype_t, ndim=2] w
		Py_ssize_t x0,y0, skyid
	for skyid in xrange(300):
		X = M[ M[:,0] == skyid ]
		for x0 in xrange(0,4200,100):
			for y0 in xrange(0,4200,100):
				w = X[check_distance( X[:,1], X[:,2], x0, y0, r)]
				if w.shape[0] > 0:
					s[skyid, int(x0/100.0), int(y0/100.0)] = calc_signal_intensity(w[:,1], w[:,2], w[:,3], w[:,4], x0, y0)
	return s.reshape( (300, -1) ) # (# of samples, # of features)

In [69]:

%%cython
import numpy as np

cimport numpy as np
from libc.math cimport sqrt
cimport cython

ctypedef np.float64_t dtype_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef generate_halos(np.int number_of_halos=1):
	cdef:
		np.ndarray[dtype_t, ndim=2] halos = np.zeros( (3, 5), dtype=np.float64)
	for i in xrange(number_of_halos):
		halos[i, 0] = np.random.random_sample() * 4200.0 # x
		halos[i, 1] = np.random.random_sample() * 4200.0 # y
		halos[i, 2] = np.random.random_sample() * 1000.0 # r0
		halos[i, 3] = np.random.random_sample() * 200.0  # m
		halos[i, 4] = np.random.random_sample() * 0.0    # std
	return halos

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef init(np.ndarray[dtype_t, ndim=2] data):
	cdef:
		np.int trial, no_halos
		np.ndarray[dtype_t, ndim=3] best_halos = np.zeros((300, 3, 5), dtype=np.float64)
		np.ndarray[dtype_t, ndim=2] log_likelihoods = np.zeros( (300, 2), dtype=np.float64)
		np.ndarray[dtype_t, ndim=2] X, halos
		dtype_t log_likelihood
		Py_ssize_t skyid
	
	for skyid in xrange(data[:,0].min(), data[:,0].max()+1):
		X = data[ data[:,0] == skyid ]	# fancy indexing -> calls copy constructor :(
		
		log_likelihoods[skyid, 0] = -np.float('inf')	# initialize log_likelihood to negative infinity
		log_likelihoods[skyid, 1] = 0	# initialize # of acceptances
		no_halos = np.int(X[skyid, 5])
		for trial in xrange(5000):
			halos = generate_halos( no_halos )
			log_likelihood = calc_log_likelihood(X, halos)
			
			if log_likelihood > log_likelihoods[skyid, 0]:
				log_likelihoods[skyid, 1] += 1
				best_halos[skyid] = halos
				log_likelihoods[skyid, 0] = log_likelihood
		#print 'finished initializing sky#%s -> %s accepted with loglikelihood=%s' % (str(skyid+1), str(log_likelihoods[skyid, 1]), str(log_likelihoods[skyid, 0]))
	return best_halos, log_likelihoods

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_log_likelihood(np.ndarray[dtype_t, ndim=2] X, np.ndarray[dtype_t, ndim=2] halos):
	cdef:
		dtype_t halo_predicted_x, halo_predicted_y, halo_predicted_r_cutoff, halo_predicted_r_mass, halo_predicted_std, e_std
		np.ndarray[dtype_t, ndim=1] predicted_var, e1_force, e2_force, phi, dist_halo1, r_effective, force,  
		Py_ssize_t i, j, k, N
		np.int number_halos = np.int(X[0,5])
	e_std = 0.22
	e1_force = np.zeros_like(X[:,1])
	e2_force = np.zeros_like(X[:,1])
	
    # model parameter constraints -> shamelessly stolen from Iain Murray
	if (np.any(halos[:,0] < 0) or np.any(halos[:,0] > 4200) or 
		np.any(halos[:,1] < 0) or np.any(halos[:,1] > 4200) or
		np.any(halos[:,2] < 0) or np.any(halos[:,2] > 1000) or
		np.any(halos[:,3] < 0) or np.any(halos[:,3] > 300)  or
		np.any(halos[:,4] < 0) or np.any(halos[:,4] > 0.3)):
		return -np.float('inf')

	for i in xrange(number_halos):
		halo = halos[i,:]

		halo_predicted_x = halo[0]
		halo_predicted_y = halo[1]
		halo_predicted_r_cutoff = halo[2]
		halo_predicted_mass = halo[3]
		halo_predicted_std = e_std + halo[4]
		
		predicted_var = (e_std * e_std) * np.ones_like(X[:,1])

		phi = np.arctan( (X[:,2] - halo_predicted_y) / (X[:,1] - halo_predicted_x) )
		dist_halo1 = ( (X[:,1] - halo_predicted_x)**2 + (X[:,2] - halo_predicted_y)**2 )**(0.5)
		r_effective = np.maximum(halo_predicted_r_cutoff, dist_halo1)
		force = halo_predicted_mass/(r_effective*1.0)
		
		e1_force += -force*np.cos(2*phi)
		e2_force += -force*np.sin(2*phi)

		predicted_var[dist_halo1 < halo_predicted_r_cutoff] = np.maximum( predicted_var[dist_halo1 < halo_predicted_r_cutoff], halo_predicted_std * halo_predicted_std)

	log_likelihood = np.sum( -0.5 * ( ((X[:,3]-e1_force)**2 + (X[:,4]-e2_force)**2 ) / (predicted_var)) - np.log(2*np.pi*predicted_var) )
	
	return log_likelihood

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef mcmc(np.ndarray[dtype_t, ndim=2] data, np.ndarray[dtype_t, ndim=3] best_halos, np.ndarray[dtype_t, ndim=2] log_likelihoods, np.int no_iterations, dtype_t stepsize):
	cdef:
		np.ndarray[dtype_t, ndim=2] proposed_halos = np.empty_like(best_halos[0])
		np.ndarray[dtype_t, ndim=4] samples = np.zeros( (300, no_iterations, 3, 5), dtype=np.float64 )
		np.ndarray[dtype_t, ndim=2] X
		np.ndarray[dtype_t, ndim=2] perturbation = np.zeros( (3,5), dtype=np.float64)
		dtype_t log_likelihood_proposed
		np.int number_halos, iteration
		Py_ssize_t skyid
	
	for skyid in xrange(data[:,0].min(), data[:,0].max()+1):
		X = data[ data[:,0] == skyid ]
		number_halos = np.int(X[0,5])
		log_likelihoods[skyid, 1] = 0
	
		for iteration in xrange(no_iterations): # stepsize should be different for each variable type!
			perturbation[:number_halos] = stepsize*np.random.randn(number_halos,5)
			proposed_halos = perturbation + best_halos[skyid]
			log_likelihood_proposed = calc_log_likelihood(X, proposed_halos)
			if np.log( np.random.random_sample() ) < (log_likelihood_proposed - log_likelihoods[skyid, 0]):
				log_likelihoods[skyid, 1] += 1
				log_likelihoods[skyid, 0] = log_likelihood_proposed
				best_halos[skyid] = proposed_halos
			samples[skyid][iteration] = proposed_halos
		#print 'finished processing skyid%s' % str(skyid+1)
		#print '\tacceptance ratio = %s / %s\tlog_likelihood = %s' % (str(log_likelihoods[skyid, 1]), str(no_iterations), str(log_likelihoods[skyid, 0]) )

	return best_halos, samples

In [8]:

@joblib_cache.cache
def load_data():
	train_halos = pd.read_csv( os.path.join(data_directory, 'Training_halos.csv') )
	test_halo_counts = pd.read_csv( os.path.join(data_directory, 'Test_haloCounts.csv') )

	train_skies = pd.DataFrame()
	for s in os.listdir( os.path.join(data_directory, 'Train_Skies') ):
		try:
			d = pd.read_csv( os.path.join(data_directory, 'Train_Skies', s) )
			d['SkyId'] = s.split("_")[1].split(".")[0]
			d['ID'] = d['SkyId'].map(lambda x: int( str(x).split('y')[1]) )-1
			d['a'] = 1/(1-np.sqrt(d['e1']**2 + d['e2']**2))
			d['b'] = 1/(1+np.sqrt(d['e1']**2 + d['e2']**2))
			d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1'])*0.5)
			d['e1_norm'] = (d.e1-d.e1.mean()) / d.e1.std()
			d['e2_norm'] = (d.e2-d.e2.mean()) / d.e2.std()
			train_skies = train_skies.append(d)
		except Exception as e:
			print s,' -> ', e.message

	test_skies = pd.DataFrame()
	for s in os.listdir( os.path.join(data_directory, 'Test_Skies') ):
		try:
			d = pd.read_csv( os.path.join(data_directory, 'Test_Skies', s) )
			d['SkyId'] = s.split("_")[1].split(".")[0]
			d['ID'] = d['SkyId'].map(lambda x: int( str(x).split('y')[1]) )-1
			d['a'] = 1/(1-np.sqrt(d['e1']**2 + d['e2']**2))
			d['b'] = 1/(1+np.sqrt(d['e1']**2 + d['e2']**2))
			d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1'])*0.5)
			d['e1_norm'] = (d.e1-d.e1.mean()) / d.e1.std()
			d['e2_norm'] = (d.e2-d.e2.mean()) / d.e2.std()
			test_skies = test_skies.append(d)
		except Exception as e:
			print s,' -> ', e.message

	df = pd.merge(train_skies, train_halos, on='SkyId')
	df_test = pd.merge(test_skies, test_halo_counts, on='SkyId')

	return df, df_test, train_halos

In [9]:

@joblib_cache.cache
def calc_strength(X, r):
    return calc_strength_(X,r)

In [55]:

def draw_sky(X, train_halos, calculated_halos = None, size_multiplier=25, filename=None):
	fig = plt.figure(figsize=(10,10))
	ax = fig.add_subplot(111, aspect='equal')
	for row in xrange(X.index.min().values, X.index.max().values):
		try:
			x,y,a,b,theta = X.ix[row][ ['x','y','a','b','theta'] ].values
			galaxy = Ellipse(xy=(x, y), width=size_multiplier*a, height=size_multiplier*b, angle=theta, fc='none')
			ax.add_patch(galaxy)
		except:
			print row
	for i in range(1,1+train_halos['numberHalos'].values):
		x,y = train_halos['halo_x'+str(i)].values, train_halos['halo_y'+str(i)].values
		print x,y
		ax.add_patch(Circle(xy=(x, y), fc='r', ec='none', alpha=0.5, radius=3*size_multiplier))
	ax.autoscale_view(tight=True)
	plt.show()
	if filename is not None:
		fig.savefig(filename+'.pdf', format='pdf')#, bbox_inches='tight')

In [54]:

# the heatmap below shows the strength of the signal intensity
# due to the presence of a halo. 
def draw_heatmap(ID, df, train_halos, r=2000, filename=None):
	skyid='Sky'+str(ID+1)
	X = df[ ['ID','x','y','e1','e2','a','b','theta'] ].values
	strength = calc_strength(X, r=r)
	#print strength[np.isnan(strength)].shape
	s = strength.reshape( (300,42,42) )
	fig = plt.figure(figsize=(14,14))
	imgplot = imshow(asarray(s[ID].T), origin='lower')
	number_of_halos = train_halos[ train_halos.SkyId == skyid]['numberHalos'].values[0]

	imgplot.set_cmap('spectral')
	plt.colorbar()
	colors = itertools.cycle('rgbcmykw')

	for color, halo in zip(colors, get_max(s[ID], max_no_galaxies=number_of_halos)):
		print halo
		scatter(halo[0], halo[1], marker='o', s = 500, c=color, linewidths=2)

	th = train_halos[train_halos.SkyId == 'Sky'+str(ID+1)]
	for i in range(1,1+th['numberHalos'].values):
		x,y = th['halo_x'+str(i)].values, th['halo_y'+str(i)].values
		scatter(x/100, y/100, marker='x', s = 500, c='Y', linewidths=2)
		print x[0]/100,',',y[0]/100,',', s[ID][int(x/100)][int(y/100)]
		#ax.add_patch(Circle(xy=(x, y), fc='y', ec='none', alpha=0.5, radius=2*size_multiplier))    
	#ax.add_patch(Circle(xy=(100*halo[0], 100*halo[1]), fc=color, ec='none', alpha=0.5, radius=500))
	#ax.autoscale_view(tight=True)
	if filename is not None:
		fig.savefig(filename+'.pdf', format='pdf')#, bbox_inches='tight')

In [12]:

def test_calc_log_likelihood(df_test):
    tol=10**(-5)
    halo1 = np.array( [[4200, 4200, 1000, 200, 0]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo1) - 107.65312710886943) <= tol
    halo2 = np.array( [[2400, 2400, 500, 100, 0]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo2) - 116.10013304100042) <= tol
    halo3 = np.array( [[2400, 2400, 500, 100, 0.01]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo3) - 116.53971627108918) <= tol
    halo4 = np.array( [[2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo4) - 116.15631624187554) <= tol
    halo5 = np.array( [[2400, 2400, 500, 100, 0.001],[2400, 2400, 500, 100, 0.001]], dtype=np.float64 )
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 40][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo5) - -47.3726778566) <= tol
    halo6 = np.array( [[2400, 2400, 500, 100, 0.001],[2400, 2400, 500, 100, 0.001],[2400, 2400, 500, 100, 0.001]], dtype=np.float64 )
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 99][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo6) - -218.106874888) <= tol
    halo7 = np.array( [[2820.241455786800543, 2530.896772986271117, 95.463841074174709, 5.031270423604361, 0.],
       [1239.840315636071637, 2429.487190770937559, 980.917139013906194, 52.64664806466692, 0.],
       [108.199502973083582, 140.00882477450304 , 610.810091887177578, 107.024441568824273, 0.]])
    assert np.abs(calc_log_likelihood(df_test[df_test.ID==99][['ID','x','y','e1','e2','NumberHalos']].values, halo7) - 91.1307046217) <= tol

In [13]:

@joblib_cache.cache
def run_init(X):
    #X = df_test[ ['ID','x','y','e1','e2','NumberHalos'] ]
    #X = df[ ['ID','x','y','e1','e2','numberHalos'] ]
    best_halos, log_likelihoods = init(X.values)
    return best_halos, log_likelihoods

In [14]:

@joblib_cache.cache
def run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019):
    predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
    return predictions, samples

In [15]:

def test(ID, no_iterations=20000, step_size=0.019):
    X = df[df.ID==ID][['ID','x','y','e1','e2','numberHalos']]
    best_halos, log_likelihoods = init(X.values)
    predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
    return predictions, samples

See below for Data Analysis¶

In [16]:

df, df_test, train_halos = load_data()

.DS_Store  ->  no item named e1
.DS_Store  ->  no item named e1

In [17]:

test_calc_log_likelihood(df_test)

In [18]:

# "training set"
df.head()

Out[18]:

	GalaxyID	SkyId	a	b	e1	e1_norm	e2	e2_norm	theta	x	y	numberHalos	x_ref	y_ref	halo_x1	halo_y1
0	Galaxy1	Sky1	1.440	0.766	-0.051	-0.111	-0.301	-1.366	-49.765	700.87	89.98	1	1086.8	1114.61	1086.8	1114.61
1	Galaxy2	Sky1	1.341	0.797	-0.249	-1.005	0.052	0.369	84.096	570.74	3528.23	1	1086.8	1114.61	1086.8	1114.61
2	Galaxy3	Sky1	1.952	0.672	0.484	2.296	0.061	0.413	3.587	579.66	1381.65	1	1086.8	1114.61	1086.8	1114.61
3	Galaxy4	Sky1	1.270	0.825	-0.043	-0.079	0.208	1.135	50.905	1595.64	1226.96	1	1086.8	1114.61	1086.8	1114.61
4	Galaxy5	Sky1	1.300	0.812	-0.102	-0.343	-0.207	-0.904	-58.101	444.24	1567.29	1	1086.8	1114.61	1086.8	1114.61

In [19]:

# "test set"
df_test.head()

Out[19]:

	GalaxyID	SkyId	a	b	e1	e1_norm	e2	e2_norm	theta	x	y	NumberHalos
0	Galaxy1	Sky1	1.825	0.689	-0.392	-1.881	0.225	1.053	75.062	3225.83	1229.26	1
1	Galaxy2	Sky1	1.050	0.955	-0.032	-0.128	-0.035	-0.144	-66.494	2381.05	2629.69	1
2	Galaxy3	Sky1	1.192	0.862	0.159	0.808	0.020	0.109	3.591	486.04	2901.77	1
3	Galaxy4	Sky1	1.593	0.729	-0.332	-1.590	0.168	0.792	76.559	938.82	2202.68	1
4	Galaxy5	Sky1	1.244	0.836	0.126	0.643	0.151	0.710	25.075	1746.20	2401.99	1

Let's analyze the distribution of galaxies¶

need to look at both x vs y and e1 vs e2¶

In [20]:

# x and y are both uniformly distributed
# with mean ~2100 and std=~1210
# see below for distribution plots
no_halos = 1
print df[df.numberHalos == no_halos].x.mean()
print df[df.numberHalos == no_halos].x.std()
print df[df.numberHalos == no_halos].y.mean()
print df[df.numberHalos == no_halos].y.std()

2109.85228474
1211.24752398
2107.21994394
1209.03071121

In [222]:

plt.figure(figsize=(7,7))
df['x'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('x_distribution.pdf', format='pdf')

In [223]:

plt.figure(figsize=(7,7))
df['y'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('y_distribution.pdf', format='pdf')

In [224]:

plt.figure(figsize=(10,10))
plt.scatter( df['x'], df['y'], s=1, cmap=mpl.cm.gray )
plt.savefig('x_vs_y_distribution.pdf', format='pdf')

In [24]:

# e1 and e2 are both N(0, 0.22**2)
# independent of the number of halos
# see below for distribution plots
no_halos = 1
print df[df.numberHalos == no_halos].e1.mean()
print df[df.numberHalos == no_halos].e1.std()
print df[df.numberHalos == no_halos].e2.mean()
print df[df.numberHalos == no_halos].e2.std()

0.00106709782568
0.221708498853
-0.000476517534257
0.219473684278

In [225]:

plt.figure(figsize=(7,7))
df['e1_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e1_norm_distribution.pdf', format='pdf')

In [226]:

plt.figure(figsize=(7,7))
df['e2_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e2_norm_distribution.pdf', format='pdf')

In [227]:

plt.figure(figsize=(7,7))
plt.scatter( df['e1'], df['e2'], s=1, cmap=mpl.cm.gray )
plt.savefig('e1_vs_e2_norm_distribution.pdf', format='pdf')

Sky Charts...¶

In [78]:

# 1 halo case
skyid='Sky99'
draw_sky(df[ df['SkyId'] == skyid], train_halos[ train_halos['SkyId'] == skyid])

[ 3294.2199999999998] [ 2646.389999999999873]

In [77]:

# 2 halo case - check out those galaxies!
skyid='Sky116'
draw_sky(df[ df['SkyId'] == skyid], train_halos[ train_halos['SkyId'] == skyid], filename='Sky116')

[ 3333.7800000000002] [ 3487.480000000000018]
[ 1327.279999999999973] [ 4020.019999999999982]

In [76]:

# 3 halo case
skyid='Sky300'
draw_sky(df[ df['SkyId'] == skyid], train_halos[ train_halos['SkyId'] == skyid], filename='sky300')

[ 2502.949999999999818] [ 590.700000000000045]
[ 3341.2800000000002] [ 3204.820000000000164]
[ 773.75] [ 1877.490000000000009]

Intensity Plots¶

In [74]:

draw_heatmap(99, df, train_halos, r=4000, filename='heatmap_99')

(38, 9, 0.06801655884314046)
38.6867 , 9.7809 , 0.0680165588431

In [75]:

draw_heatmap(115, df, train_halos, r=4000, filename='heatmap116')

(33, 35, 0.10854675558956287)
(16, 38, 0.020197036749682266)
33.3378 , 34.8748 , 0.102692526313
13.2728 , 40.2002 , 0.0123323601289

Simulation time...¶

In [62]:

X = df[ ['ID','x','y','e1','e2','numberHalos'] ]

In [66]:

best_halos, log_likelihoods = run_init(X)

In [220]:

predictions, samples = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019)

In [68]:

predictions2, samples2 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.19)

In [70]:

predictions3, samples3 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=1.9)

In [52]:

plt.scatter(samples[0][30000:].T[0], samples[0][5000:].T[1])

Out[52]:

<matplotlib.collections.PathCollection at 0x10d845390>

Results¶

In [174]:

burnin = 40000
results = []
for SkyID in xrange(300):
    t = {}
    t['SkyId'] = 'Sky'+str(SkyID+1)
    for halo in [0,1,2]:
        t['halo_x'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[0][halo].mean()
        t['halo_y'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[1][halo].mean()
    results.append(t)

In [204]:

predictions_df = pd.DataFrame(results)
predictions_df = predictions_df[['SkyId','halo_x1_prediction','halo_y1_prediction','halo_x2_prediction','halo_y2_prediction','halo_x3_prediction','halo_y3_prediction']]

final_results = pd.merge(train_halos, predictions_df, on='SkyId')

In [205]:

final_results_1_halo = final_results[final_results.numberHalos == 1]
final_results_1_halo['x1_diff'] = final_results_1_halo['halo_x1'] - final_results_1_halo['halo_x1_prediction']
final_results_1_halo['y1_diff'] = final_results_1_halo['halo_y1'] - final_results_1_halo['halo_y1_prediction']

In [228]:

print final_results_1_halo['x1_diff'].mean()
print final_results_1_halo['x1_diff'].std()
final_results_1_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_1halo_error.pdf', format='pdf')

-9.11318578604
322.716605257

In [229]:

print final_results_1_halo['y1_diff'].mean()
print final_results_1_halo['y1_diff'].std()
final_results_1_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_1halo_error.pdf', format='pdf')

-14.3968733588
216.444298644

In [216]:

# don't know the proper halo ordering, so will assume that the predicted
# halos are closer to the actual halos, :p
final_results_2_halo = final_results[final_results.numberHalos == 2]
final_results_2_halo['x1_diff'] = np.minimum(final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x1_prediction'], final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y1_diff'] = np.minimum(final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y1_prediction'], final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo['x2_diff'] = np.minimum(final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x1_prediction'], final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y2_diff'] = np.minimum(final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y1_prediction'], final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y2_prediction'])

In [217]:

final_results_2_halo.head()

Out[217]:

	SkyId	numberHalos	x_ref	y_ref	halo_x1	halo_y1	halo_x2	halo_y2	halo_x1_prediction	halo_y1_prediction	halo_x2_prediction	halo_y2_prediction	x1_diff	y1_diff	x2_diff	y2_diff
100	Sky101	2	3136.47	932.52	4093.96	345.65	1604.48	1871.51	3589.258	99.660	4118.122	109.834	-24.162	235.816	-2513.642	1761.676
101	Sky102	2	266.16	721.01	19.77	790.82	2591.24	62.25	1830.988	515.728	52.285	672.802	-1811.218	118.018	760.252	-610.552
102	Sky103	2	1797.30	2446.26	1775.04	2459.69	3644.59	1331.22	2831.102	3551.173	1824.692	2449.746	-1056.062	-1091.483	813.488	-2219.953
103	Sky104	2	506.86	2603.82	527.74	2497.54	332.85	3489.45	617.555	2524.441	3216.009	2226.939	-2688.269	-26.901	-2883.159	965.009
104	Sky105	2	1627.06	2360.39	1084.54	2839.79	3245.09	930.61	3830.695	3839.146	1494.022	551.490	-2746.155	-999.356	-585.605	-2908.536

In [230]:

print final_results_2_halo['x1_diff'].mean()
print final_results_2_halo['x1_diff'].std()
final_results_2_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_2halo_error.pdf', format='pdf')

-789.624695689
1184.50692579

In [231]:

print final_results_2_halo['y1_diff'].mean()
print final_results_2_halo['y1_diff'].std()
final_results_2_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_1halo_error.pdf', format='pdf')

-717.37000767
1067.7849683

In [232]:

print final_results_2_halo['x2_diff'].mean()
print final_results_2_halo['x2_diff'].std()
final_results_2_halo['x2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x2_1halo_error.pdf', format='pdf')

-813.816195689
1364.53121299

In [233]:

print final_results_2_halo['y2_diff'].mean()
print final_results_2_halo['y2_diff'].std()
final_results_2_halo['y2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y2_1halo_error.pdf', format='pdf')

-780.52580767
1466.65282258