from IPython.display import display
%load_ext retina
%load_ext autosave
%autosave 90
%pylab inline
Usage: %autosave [seconds] autosaving every 90s
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
import os, sys, re, json, itertools, operator
import pandas as pd, numpy as np, scipy as sp
from matplotlib.patches import Ellipse, Circle
from joblib import Memory
%load_ext cythonmagic
data_directory = os.path.abspath(os.path.join(os.path.expanduser("~"), 'build', 'kaggle', 'DarkWorlds'))
output_directory = os.path.join(data_directory, 'output')
if not os.path.exists(output_directory):
os.makedirs(output_directory)
joblib_cache = Memory(cachedir=os.path.join(output_directory, 'joblib'), verbose=0)
pd.set_printoptions(notebook_repr_html=True, precision=4, max_columns=200, max_rows=50, max_colwidth=25, column_space=10)
np.set_printoptions(linewidth=200, precision=15, suppress=True)
/Users/vgoklani/anaconda/lib/python2.7/site-packages/pandas-0.10.0.dev_74ab638-py2.7-macosx-10.5-x86_64.egg/pandas/core/format.py:1264: FutureWarning: set_printoptions is deprecated, use set_option instead FutureWarning)
%%cython
import numpy as np
cimport numpy as np
#from libc.math cimport sqrt
cimport cython
ctypedef np.float64_t dtype_t
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0, np.ndarray[dtype_t, ndim=1] x1, np.ndarray[dtype_t, ndim=1] y1, dtype_t p=2.0):
return ( (x0-x1)**p + (y0-y1)**p )**(1.0/p)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef check_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0, dtype_t x1, dtype_t y1, dtype_t r):
return ( ( (x0-x1)**2 + (y0-y1)**2 )**(0.5) <= r )
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y, np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2, np.ndarray[dtype_t, ndim=1] x_halo, np.ndarray[dtype_t, ndim=1] y_halo):
if x_halo.all() == 0.0 and y_halo.all() == 0.0:
return 0
cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2( (y - y_halo), (x - x_halo) )
return (-( e1*np.cos(2*phi) + e2*np.sin(2*phi) ))
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_point(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y, np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2, dtype_t x_halo, dtype_t y_halo):
cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2( (y - y_halo), (x - x_halo) )
return (-( e1*np.cos(2*phi) + e2*np.sin(2*phi) )).mean()
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_intensity(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y, np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2, dtype_t x_halo, dtype_t y_halo, dtype_t p=2):
cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2( (y - y_halo), (x - x_halo) )
cdef np.ndarray[dtype_t, ndim=1] signal = -( e1*np.cos(2*phi) + e2*np.sin(2*phi))
cdef np.ndarray[dtype_t, ndim=1] distance = ( (x-x_halo)**2 + (y-y_halo)**2 )**(0.5)
return (signal/(distance**0)).mean()
# @todo: rewrite this abomination...
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef get_max(np.ndarray[dtype_t, ndim=2] X, dtype_t r1=15, Py_ssize_t max_no_galaxies = 3, dtype_t r2=15):
cdef:
dtype_t curr_max1 = 0.0, curr_max2 = 0.0, curr_max3 = 0.0
Py_ssize_t i, i_max1, i_max2, i_max3
Py_ssize_t j, j_max1, j_max2, j_max3
for i in xrange(X.shape[0]):
for j in xrange(X.shape[1]):
if X[i,j] > curr_max1:
curr_max1 = X[i,j]
i_max1 = i
j_max1 = j
elif max_no_galaxies > 1 and X[i,j] > curr_max2 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2):
curr_max2 = X[i,j]
i_max2 = i
j_max2 = j
elif max_no_galaxies > 2 and X[i,j] > curr_max3 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2) and (((i_max2 - i)**2 + (j_max2 - j)**2) > r2**2):
curr_max3 = X[i,j]
i_max3 = i
j_max3 = j
return [(i_max1, j_max1, curr_max1), (i_max2, j_max2, curr_max2), (i_max3, j_max3, curr_max3)][:max_no_galaxies]
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_strength_(np.ndarray[dtype_t, ndim=2] M, dtype_t r=5000):
cdef:
np.ndarray[dtype_t, ndim=3] s = np.zeros((300, 42, 42), dtype=np.float64)
np.ndarray[dtype_t, ndim=2] w
Py_ssize_t x0,y0, skyid
for skyid in xrange(300):
X = M[ M[:,0] == skyid ]
for x0 in xrange(0,4200,100):
for y0 in xrange(0,4200,100):
w = X[check_distance( X[:,1], X[:,2], x0, y0, r)]
if w.shape[0] > 0:
s[skyid, int(x0/100.0), int(y0/100.0)] = calc_signal_intensity(w[:,1], w[:,2], w[:,3], w[:,4], x0, y0)
return s.reshape( (300, -1) ) # (# of samples, # of features)
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython
ctypedef np.float64_t dtype_t
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef generate_halos(np.int number_of_halos=1):
cdef:
np.ndarray[dtype_t, ndim=2] halos = np.zeros( (3, 5), dtype=np.float64)
for i in xrange(number_of_halos):
halos[i, 0] = np.random.random_sample() * 4200.0 # x
halos[i, 1] = np.random.random_sample() * 4200.0 # y
halos[i, 2] = np.random.random_sample() * 1000.0 # r0
halos[i, 3] = np.random.random_sample() * 200.0 # m
halos[i, 4] = np.random.random_sample() * 0.0 # std
return halos
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef init(np.ndarray[dtype_t, ndim=2] data):
cdef:
np.int trial, no_halos
np.ndarray[dtype_t, ndim=3] best_halos = np.zeros((300, 3, 5), dtype=np.float64)
np.ndarray[dtype_t, ndim=2] log_likelihoods = np.zeros( (300, 2), dtype=np.float64)
np.ndarray[dtype_t, ndim=2] X, halos
dtype_t log_likelihood
Py_ssize_t skyid
for skyid in xrange(data[:,0].min(), data[:,0].max()+1):
X = data[ data[:,0] == skyid ] # fancy indexing -> calls copy constructor :(
log_likelihoods[skyid, 0] = -np.float('inf') # initialize log_likelihood to negative infinity
log_likelihoods[skyid, 1] = 0 # initialize # of acceptances
no_halos = np.int(X[skyid, 5])
for trial in xrange(5000):
halos = generate_halos( no_halos )
log_likelihood = calc_log_likelihood(X, halos)
if log_likelihood > log_likelihoods[skyid, 0]:
log_likelihoods[skyid, 1] += 1
best_halos[skyid] = halos
log_likelihoods[skyid, 0] = log_likelihood
#print 'finished initializing sky#%s -> %s accepted with loglikelihood=%s' % (str(skyid+1), str(log_likelihoods[skyid, 1]), str(log_likelihoods[skyid, 0]))
return best_halos, log_likelihoods
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_log_likelihood(np.ndarray[dtype_t, ndim=2] X, np.ndarray[dtype_t, ndim=2] halos):
cdef:
dtype_t halo_predicted_x, halo_predicted_y, halo_predicted_r_cutoff, halo_predicted_r_mass, halo_predicted_std, e_std
np.ndarray[dtype_t, ndim=1] predicted_var, e1_force, e2_force, phi, dist_halo1, r_effective, force,
Py_ssize_t i, j, k, N
np.int number_halos = np.int(X[0,5])
e_std = 0.22
e1_force = np.zeros_like(X[:,1])
e2_force = np.zeros_like(X[:,1])
# model parameter constraints -> shamelessly stolen from Iain Murray
if (np.any(halos[:,0] < 0) or np.any(halos[:,0] > 4200) or
np.any(halos[:,1] < 0) or np.any(halos[:,1] > 4200) or
np.any(halos[:,2] < 0) or np.any(halos[:,2] > 1000) or
np.any(halos[:,3] < 0) or np.any(halos[:,3] > 300) or
np.any(halos[:,4] < 0) or np.any(halos[:,4] > 0.3)):
return -np.float('inf')
for i in xrange(number_halos):
halo = halos[i,:]
halo_predicted_x = halo[0]
halo_predicted_y = halo[1]
halo_predicted_r_cutoff = halo[2]
halo_predicted_mass = halo[3]
halo_predicted_std = e_std + halo[4]
predicted_var = (e_std * e_std) * np.ones_like(X[:,1])
phi = np.arctan( (X[:,2] - halo_predicted_y) / (X[:,1] - halo_predicted_x) )
dist_halo1 = ( (X[:,1] - halo_predicted_x)**2 + (X[:,2] - halo_predicted_y)**2 )**(0.5)
r_effective = np.maximum(halo_predicted_r_cutoff, dist_halo1)
force = halo_predicted_mass/(r_effective*1.0)
e1_force += -force*np.cos(2*phi)
e2_force += -force*np.sin(2*phi)
predicted_var[dist_halo1 < halo_predicted_r_cutoff] = np.maximum( predicted_var[dist_halo1 < halo_predicted_r_cutoff], halo_predicted_std * halo_predicted_std)
log_likelihood = np.sum( -0.5 * ( ((X[:,3]-e1_force)**2 + (X[:,4]-e2_force)**2 ) / (predicted_var)) - np.log(2*np.pi*predicted_var) )
return log_likelihood
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef mcmc(np.ndarray[dtype_t, ndim=2] data, np.ndarray[dtype_t, ndim=3] best_halos, np.ndarray[dtype_t, ndim=2] log_likelihoods, np.int no_iterations, dtype_t stepsize):
cdef:
np.ndarray[dtype_t, ndim=2] proposed_halos = np.empty_like(best_halos[0])
np.ndarray[dtype_t, ndim=4] samples = np.zeros( (300, no_iterations, 3, 5), dtype=np.float64 )
np.ndarray[dtype_t, ndim=2] X
np.ndarray[dtype_t, ndim=2] perturbation = np.zeros( (3,5), dtype=np.float64)
dtype_t log_likelihood_proposed
np.int number_halos, iteration
Py_ssize_t skyid
for skyid in xrange(data[:,0].min(), data[:,0].max()+1):
X = data[ data[:,0] == skyid ]
number_halos = np.int(X[0,5])
log_likelihoods[skyid, 1] = 0
for iteration in xrange(no_iterations): # stepsize should be different for each variable type!
perturbation[:number_halos] = stepsize*np.random.randn(number_halos,5)
proposed_halos = perturbation + best_halos[skyid]
log_likelihood_proposed = calc_log_likelihood(X, proposed_halos)
if np.log( np.random.random_sample() ) < (log_likelihood_proposed - log_likelihoods[skyid, 0]):
log_likelihoods[skyid, 1] += 1
log_likelihoods[skyid, 0] = log_likelihood_proposed
best_halos[skyid] = proposed_halos
samples[skyid][iteration] = proposed_halos
#print 'finished processing skyid%s' % str(skyid+1)
#print '\tacceptance ratio = %s / %s\tlog_likelihood = %s' % (str(log_likelihoods[skyid, 1]), str(no_iterations), str(log_likelihoods[skyid, 0]) )
return best_halos, samples
@joblib_cache.cache
def load_data():
train_halos = pd.read_csv( os.path.join(data_directory, 'Training_halos.csv') )
test_halo_counts = pd.read_csv( os.path.join(data_directory, 'Test_haloCounts.csv') )
train_skies = pd.DataFrame()
for s in os.listdir( os.path.join(data_directory, 'Train_Skies') ):
try:
d = pd.read_csv( os.path.join(data_directory, 'Train_Skies', s) )
d['SkyId'] = s.split("_")[1].split(".")[0]
d['ID'] = d['SkyId'].map(lambda x: int( str(x).split('y')[1]) )-1
d['a'] = 1/(1-np.sqrt(d['e1']**2 + d['e2']**2))
d['b'] = 1/(1+np.sqrt(d['e1']**2 + d['e2']**2))
d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1'])*0.5)
d['e1_norm'] = (d.e1-d.e1.mean()) / d.e1.std()
d['e2_norm'] = (d.e2-d.e2.mean()) / d.e2.std()
train_skies = train_skies.append(d)
except Exception as e:
print s,' -> ', e.message
test_skies = pd.DataFrame()
for s in os.listdir( os.path.join(data_directory, 'Test_Skies') ):
try:
d = pd.read_csv( os.path.join(data_directory, 'Test_Skies', s) )
d['SkyId'] = s.split("_")[1].split(".")[0]
d['ID'] = d['SkyId'].map(lambda x: int( str(x).split('y')[1]) )-1
d['a'] = 1/(1-np.sqrt(d['e1']**2 + d['e2']**2))
d['b'] = 1/(1+np.sqrt(d['e1']**2 + d['e2']**2))
d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1'])*0.5)
d['e1_norm'] = (d.e1-d.e1.mean()) / d.e1.std()
d['e2_norm'] = (d.e2-d.e2.mean()) / d.e2.std()
test_skies = test_skies.append(d)
except Exception as e:
print s,' -> ', e.message
df = pd.merge(train_skies, train_halos, on='SkyId')
df_test = pd.merge(test_skies, test_halo_counts, on='SkyId')
return df, df_test, train_halos
@joblib_cache.cache
def calc_strength(X, r):
return calc_strength_(X,r)
def draw_sky(X, train_halos, calculated_halos = None, size_multiplier=25, filename=None):
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, aspect='equal')
for row in xrange(X.index.min().values, X.index.max().values):
try:
x,y,a,b,theta = X.ix[row][ ['x','y','a','b','theta'] ].values
galaxy = Ellipse(xy=(x, y), width=size_multiplier*a, height=size_multiplier*b, angle=theta, fc='none')
ax.add_patch(galaxy)
except:
print row
for i in range(1,1+train_halos['numberHalos'].values):
x,y = train_halos['halo_x'+str(i)].values, train_halos['halo_y'+str(i)].values
print x,y
ax.add_patch(Circle(xy=(x, y), fc='r', ec='none', alpha=0.5, radius=3*size_multiplier))
ax.autoscale_view(tight=True)
plt.show()
if filename is not None:
fig.savefig(filename+'.pdf', format='pdf')#, bbox_inches='tight')
# the heatmap below shows the strength of the signal intensity
# due to the presence of a halo.
def draw_heatmap(ID, df, train_halos, r=2000, filename=None):
skyid='Sky'+str(ID+1)
X = df[ ['ID','x','y','e1','e2','a','b','theta'] ].values
strength = calc_strength(X, r=r)
#print strength[np.isnan(strength)].shape
s = strength.reshape( (300,42,42) )
fig = plt.figure(figsize=(14,14))
imgplot = imshow(asarray(s[ID].T), origin='lower')
number_of_halos = train_halos[ train_halos.SkyId == skyid]['numberHalos'].values[0]
imgplot.set_cmap('spectral')
plt.colorbar()
colors = itertools.cycle('rgbcmykw')
for color, halo in zip(colors, get_max(s[ID], max_no_galaxies=number_of_halos)):
print halo
scatter(halo[0], halo[1], marker='o', s = 500, c=color, linewidths=2)
th = train_halos[train_halos.SkyId == 'Sky'+str(ID+1)]
for i in range(1,1+th['numberHalos'].values):
x,y = th['halo_x'+str(i)].values, th['halo_y'+str(i)].values
scatter(x/100, y/100, marker='x', s = 500, c='Y', linewidths=2)
print x[0]/100,',',y[0]/100,',', s[ID][int(x/100)][int(y/100)]
#ax.add_patch(Circle(xy=(x, y), fc='y', ec='none', alpha=0.5, radius=2*size_multiplier))
#ax.add_patch(Circle(xy=(100*halo[0], 100*halo[1]), fc=color, ec='none', alpha=0.5, radius=500))
#ax.autoscale_view(tight=True)
if filename is not None:
fig.savefig(filename+'.pdf', format='pdf')#, bbox_inches='tight')
def test_calc_log_likelihood(df_test):
tol=10**(-5)
halo1 = np.array( [[4200, 4200, 1000, 200, 0]], dtype=np.float64)
assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo1) - 107.65312710886943) <= tol
halo2 = np.array( [[2400, 2400, 500, 100, 0]], dtype=np.float64)
assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo2) - 116.10013304100042) <= tol
halo3 = np.array( [[2400, 2400, 500, 100, 0.01]], dtype=np.float64)
assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo3) - 116.53971627108918) <= tol
halo4 = np.array( [[2400, 2400, 500, 100, 0.001]], dtype=np.float64)
assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo4) - 116.15631624187554) <= tol
halo5 = np.array( [[2400, 2400, 500, 100, 0.001],[2400, 2400, 500, 100, 0.001]], dtype=np.float64 )
assert np.abs(calc_log_likelihood(df_test[df_test.ID == 40][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo5) - -47.3726778566) <= tol
halo6 = np.array( [[2400, 2400, 500, 100, 0.001],[2400, 2400, 500, 100, 0.001],[2400, 2400, 500, 100, 0.001]], dtype=np.float64 )
assert np.abs(calc_log_likelihood(df_test[df_test.ID == 99][ ['ID','x','y','e1','e2','NumberHalos'] ].values, halo6) - -218.106874888) <= tol
halo7 = np.array( [[2820.241455786800543, 2530.896772986271117, 95.463841074174709, 5.031270423604361, 0.],
[1239.840315636071637, 2429.487190770937559, 980.917139013906194, 52.64664806466692, 0.],
[108.199502973083582, 140.00882477450304 , 610.810091887177578, 107.024441568824273, 0.]])
assert np.abs(calc_log_likelihood(df_test[df_test.ID==99][['ID','x','y','e1','e2','NumberHalos']].values, halo7) - 91.1307046217) <= tol
@joblib_cache.cache
def run_init(X):
#X = df_test[ ['ID','x','y','e1','e2','NumberHalos'] ]
#X = df[ ['ID','x','y','e1','e2','numberHalos'] ]
best_halos, log_likelihoods = init(X.values)
return best_halos, log_likelihoods
@joblib_cache.cache
def run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019):
predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
return predictions, samples
def test(ID, no_iterations=20000, step_size=0.019):
X = df[df.ID==ID][['ID','x','y','e1','e2','numberHalos']]
best_halos, log_likelihoods = init(X.values)
predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
return predictions, samples
df, df_test, train_halos = load_data()
.DS_Store -> no item named e1 .DS_Store -> no item named e1
test_calc_log_likelihood(df_test)
# "training set"
df.head()
GalaxyID | ID | SkyId | a | b | e1 | e1_norm | e2 | e2_norm | theta | x | y | numberHalos | x_ref | y_ref | halo_x1 | halo_y1 | halo_x2 | halo_y2 | halo_x3 | halo_y3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Galaxy1 | 0 | Sky1 | 1.440 | 0.766 | -0.051 | -0.111 | -0.301 | -1.366 | -49.765 | 700.87 | 89.98 | 1 | 1086.8 | 1114.61 | 1086.8 | 1114.61 | 0 | 0 | 0 | 0 |
1 | Galaxy2 | 0 | Sky1 | 1.341 | 0.797 | -0.249 | -1.005 | 0.052 | 0.369 | 84.096 | 570.74 | 3528.23 | 1 | 1086.8 | 1114.61 | 1086.8 | 1114.61 | 0 | 0 | 0 | 0 |
2 | Galaxy3 | 0 | Sky1 | 1.952 | 0.672 | 0.484 | 2.296 | 0.061 | 0.413 | 3.587 | 579.66 | 1381.65 | 1 | 1086.8 | 1114.61 | 1086.8 | 1114.61 | 0 | 0 | 0 | 0 |
3 | Galaxy4 | 0 | Sky1 | 1.270 | 0.825 | -0.043 | -0.079 | 0.208 | 1.135 | 50.905 | 1595.64 | 1226.96 | 1 | 1086.8 | 1114.61 | 1086.8 | 1114.61 | 0 | 0 | 0 | 0 |
4 | Galaxy5 | 0 | Sky1 | 1.300 | 0.812 | -0.102 | -0.343 | -0.207 | -0.904 | -58.101 | 444.24 | 1567.29 | 1 | 1086.8 | 1114.61 | 1086.8 | 1114.61 | 0 | 0 | 0 | 0 |
# "test set"
df_test.head()
GalaxyID | ID | SkyId | a | b | e1 | e1_norm | e2 | e2_norm | theta | x | y | NumberHalos | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Galaxy1 | 0 | Sky1 | 1.825 | 0.689 | -0.392 | -1.881 | 0.225 | 1.053 | 75.062 | 3225.83 | 1229.26 | 1 |
1 | Galaxy2 | 0 | Sky1 | 1.050 | 0.955 | -0.032 | -0.128 | -0.035 | -0.144 | -66.494 | 2381.05 | 2629.69 | 1 |
2 | Galaxy3 | 0 | Sky1 | 1.192 | 0.862 | 0.159 | 0.808 | 0.020 | 0.109 | 3.591 | 486.04 | 2901.77 | 1 |
3 | Galaxy4 | 0 | Sky1 | 1.593 | 0.729 | -0.332 | -1.590 | 0.168 | 0.792 | 76.559 | 938.82 | 2202.68 | 1 |
4 | Galaxy5 | 0 | Sky1 | 1.244 | 0.836 | 0.126 | 0.643 | 0.151 | 0.710 | 25.075 | 1746.20 | 2401.99 | 1 |
# x and y are both uniformly distributed
# with mean ~2100 and std=~1210
# see below for distribution plots
no_halos = 1
print df[df.numberHalos == no_halos].x.mean()
print df[df.numberHalos == no_halos].x.std()
print df[df.numberHalos == no_halos].y.mean()
print df[df.numberHalos == no_halos].y.std()
2109.85228474 1211.24752398 2107.21994394 1209.03071121
plt.figure(figsize=(7,7))
df['x'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('x_distribution.pdf', format='pdf')
plt.figure(figsize=(7,7))
df['y'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('y_distribution.pdf', format='pdf')
plt.figure(figsize=(10,10))
plt.scatter( df['x'], df['y'], s=1, cmap=mpl.cm.gray )
plt.savefig('x_vs_y_distribution.pdf', format='pdf')
# e1 and e2 are both N(0, 0.22**2)
# independent of the number of halos
# see below for distribution plots
no_halos = 1
print df[df.numberHalos == no_halos].e1.mean()
print df[df.numberHalos == no_halos].e1.std()
print df[df.numberHalos == no_halos].e2.mean()
print df[df.numberHalos == no_halos].e2.std()
0.00106709782568 0.221708498853 -0.000476517534257 0.219473684278
plt.figure(figsize=(7,7))
df['e1_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e1_norm_distribution.pdf', format='pdf')
plt.figure(figsize=(7,7))
df['e2_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e2_norm_distribution.pdf', format='pdf')
plt.figure(figsize=(7,7))
plt.scatter( df['e1'], df['e2'], s=1, cmap=mpl.cm.gray )
plt.savefig('e1_vs_e2_norm_distribution.pdf', format='pdf')
# 1 halo case
skyid='Sky99'
draw_sky(df[ df['SkyId'] == skyid], train_halos[ train_halos['SkyId'] == skyid])
[ 3294.2199999999998] [ 2646.389999999999873]
# 2 halo case - check out those galaxies!
skyid='Sky116'
draw_sky(df[ df['SkyId'] == skyid], train_halos[ train_halos['SkyId'] == skyid], filename='Sky116')
[ 3333.7800000000002] [ 3487.480000000000018] [ 1327.279999999999973] [ 4020.019999999999982]
# 3 halo case
skyid='Sky300'
draw_sky(df[ df['SkyId'] == skyid], train_halos[ train_halos['SkyId'] == skyid], filename='sky300')
[ 2502.949999999999818] [ 590.700000000000045] [ 3341.2800000000002] [ 3204.820000000000164] [ 773.75] [ 1877.490000000000009]
draw_heatmap(99, df, train_halos, r=4000, filename='heatmap_99')
(38, 9, 0.06801655884314046) 38.6867 , 9.7809 , 0.0680165588431
draw_heatmap(115, df, train_halos, r=4000, filename='heatmap116')
(33, 35, 0.10854675558956287) (16, 38, 0.020197036749682266) 33.3378 , 34.8748 , 0.102692526313 13.2728 , 40.2002 , 0.0123323601289
X = df[ ['ID','x','y','e1','e2','numberHalos'] ]
best_halos, log_likelihoods = run_init(X)
predictions, samples = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019)
predictions2, samples2 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.19)
predictions3, samples3 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=1.9)
plt.scatter(samples[0][30000:].T[0], samples[0][5000:].T[1])
<matplotlib.collections.PathCollection at 0x10d845390>
burnin = 40000
results = []
for SkyID in xrange(300):
t = {}
t['SkyId'] = 'Sky'+str(SkyID+1)
for halo in [0,1,2]:
t['halo_x'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[0][halo].mean()
t['halo_y'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[1][halo].mean()
results.append(t)
predictions_df = pd.DataFrame(results)
predictions_df = predictions_df[['SkyId','halo_x1_prediction','halo_y1_prediction','halo_x2_prediction','halo_y2_prediction','halo_x3_prediction','halo_y3_prediction']]
final_results = pd.merge(train_halos, predictions_df, on='SkyId')
final_results_1_halo = final_results[final_results.numberHalos == 1]
final_results_1_halo['x1_diff'] = final_results_1_halo['halo_x1'] - final_results_1_halo['halo_x1_prediction']
final_results_1_halo['y1_diff'] = final_results_1_halo['halo_y1'] - final_results_1_halo['halo_y1_prediction']
print final_results_1_halo['x1_diff'].mean()
print final_results_1_halo['x1_diff'].std()
final_results_1_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_1halo_error.pdf', format='pdf')
-9.11318578604 322.716605257
print final_results_1_halo['y1_diff'].mean()
print final_results_1_halo['y1_diff'].std()
final_results_1_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_1halo_error.pdf', format='pdf')
-14.3968733588 216.444298644
# don't know the proper halo ordering, so will assume that the predicted
# halos are closer to the actual halos, :p
final_results_2_halo = final_results[final_results.numberHalos == 2]
final_results_2_halo['x1_diff'] = np.minimum(final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x1_prediction'], final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y1_diff'] = np.minimum(final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y1_prediction'], final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo['x2_diff'] = np.minimum(final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x1_prediction'], final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y2_diff'] = np.minimum(final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y1_prediction'], final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo.head()
SkyId | numberHalos | x_ref | y_ref | halo_x1 | halo_y1 | halo_x2 | halo_y2 | halo_x3 | halo_y3 | halo_x1_prediction | halo_y1_prediction | halo_x2_prediction | halo_y2_prediction | halo_x3_prediction | halo_y3_prediction | x1_diff | y1_diff | x2_diff | y2_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
100 | Sky101 | 2 | 3136.47 | 932.52 | 4093.96 | 345.65 | 1604.48 | 1871.51 | 0 | 0 | 3589.258 | 99.660 | 4118.122 | 109.834 | 0 | 0 | -24.162 | 235.816 | -2513.642 | 1761.676 |
101 | Sky102 | 2 | 266.16 | 721.01 | 19.77 | 790.82 | 2591.24 | 62.25 | 0 | 0 | 1830.988 | 515.728 | 52.285 | 672.802 | 0 | 0 | -1811.218 | 118.018 | 760.252 | -610.552 |
102 | Sky103 | 2 | 1797.30 | 2446.26 | 1775.04 | 2459.69 | 3644.59 | 1331.22 | 0 | 0 | 2831.102 | 3551.173 | 1824.692 | 2449.746 | 0 | 0 | -1056.062 | -1091.483 | 813.488 | -2219.953 |
103 | Sky104 | 2 | 506.86 | 2603.82 | 527.74 | 2497.54 | 332.85 | 3489.45 | 0 | 0 | 617.555 | 2524.441 | 3216.009 | 2226.939 | 0 | 0 | -2688.269 | -26.901 | -2883.159 | 965.009 |
104 | Sky105 | 2 | 1627.06 | 2360.39 | 1084.54 | 2839.79 | 3245.09 | 930.61 | 0 | 0 | 3830.695 | 3839.146 | 1494.022 | 551.490 | 0 | 0 | -2746.155 | -999.356 | -585.605 | -2908.536 |
print final_results_2_halo['x1_diff'].mean()
print final_results_2_halo['x1_diff'].std()
final_results_2_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_2halo_error.pdf', format='pdf')
-789.624695689 1184.50692579
print final_results_2_halo['y1_diff'].mean()
print final_results_2_halo['y1_diff'].std()
final_results_2_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_1halo_error.pdf', format='pdf')
-717.37000767 1067.7849683
print final_results_2_halo['x2_diff'].mean()
print final_results_2_halo['x2_diff'].std()
final_results_2_halo['x2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x2_1halo_error.pdf', format='pdf')
-813.816195689 1364.53121299
print final_results_2_halo['y2_diff'].mean()
print final_results_2_halo['y2_diff'].std()
final_results_2_halo['y2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y2_1halo_error.pdf', format='pdf')
-780.52580767 1466.65282258