Vishal Goklani (twitter.com/vgoklani)
Solutions from the Kaggle Dark Worlds competition

from IPython.display import display
%load_ext retina
%load_ext autosave
%autosave 90
%pylab inline

import os, sys, re, json, itertools, operator
import pandas as pd, numpy as np, scipy as sp
import matplotlib as mpl
from matplotlib.patches import Ellipse, Circle
from joblib import Memory

%load_ext cythonmagic

data_directory = os.path.abspath(os.path.join(os.path.expanduser("~"), 'build', 'kaggle', 'DarkWorlds'))
output_directory = os.path.join(data_directory, 'output')
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

joblib_cache = Memory(cachedir=os.path.join(output_directory, 'joblib'), verbose=0)

pd.set_printoptions(notebook_repr_html=True, precision=4, max_columns=200, max_rows=50, max_colwidth=25, column_space=10)
np.set_printoptions(linewidth=200, precision=15, suppress=True)

%%cython
import numpy as np
cimport numpy as np
#from libc.math cimport sqrt
cimport cython

ctypedef np.float64_t dtype_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0,
                    np.ndarray[dtype_t, ndim=1] x1, np.ndarray[dtype_t, ndim=1] y1,
                    dtype_t p=2.0):
    # p-norm style distance between point sets (p=2 gives Euclidean)
    return ((x0 - x1)**p + (y0 - y1)**p)**(1.0 / p)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef check_distance(np.ndarray[dtype_t, ndim=1] x0, np.ndarray[dtype_t, ndim=1] y0,
                     dtype_t x1, dtype_t y1, dtype_t r):
    # boolean mask: which points lie within radius r of (x1, y1)?
    return (((x0 - x1)**2 + (y0 - y1)**2)**(0.5) <= r)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y,
                  np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2,
                  np.ndarray[dtype_t, ndim=1] x_halo, np.ndarray[dtype_t, ndim=1] y_halo):
    # all-zero halo coordinates mean "no halo" -> no signal
    if not x_halo.any() and not y_halo.any():
        return 0
    cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2((y - y_halo), (x - x_halo))
    # tangential ellipticity of each galaxy with respect to the halo
    return (-(e1*np.cos(2*phi) + e2*np.sin(2*phi)))

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_point(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y,
                        np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2,
                        dtype_t x_halo, dtype_t y_halo):
    # mean tangential ellipticity with respect to a single candidate halo position
    cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2((y - y_halo), (x - x_halo))
    return (-(e1*np.cos(2*phi) + e2*np.sin(2*phi))).mean()

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_signal_intensity(np.ndarray[dtype_t, ndim=1] x, np.ndarray[dtype_t, ndim=1] y,
                            np.ndarray[dtype_t, ndim=1] e1, np.ndarray[dtype_t, ndim=1] e2,
                            dtype_t x_halo, dtype_t y_halo, dtype_t p=2):
    cdef np.ndarray[dtype_t, ndim=1] phi = np.arctan2((y - y_halo), (x - x_halo))
    cdef np.ndarray[dtype_t, ndim=1] signal = -(e1*np.cos(2*phi) + e2*np.sin(2*phi))
    cdef np.ndarray[dtype_t, ndim=1] distance = ((x - x_halo)**2 + (y - y_halo)**2)**(0.5)
    # note: distance**0 == 1, so the distance weighting (and the p argument)
    # is currently disabled; this reduces to calc_signal_point
    return (signal/(distance**0)).mean()

# @todo: rewrite this abomination...
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef get_max(np.ndarray[dtype_t, ndim=2] X, dtype_t r1=15, Py_ssize_t max_no_galaxies=3, dtype_t r2=15):
    # greedily pick up to three grid maxima; a candidate is rejected if it sits
    # within r1 (resp. r2) cells of an already-chosen peak
    cdef:
        dtype_t curr_max1 = 0.0, curr_max2 = 0.0, curr_max3 = 0.0
        Py_ssize_t i, i_max1, i_max2, i_max3
        Py_ssize_t j, j_max1, j_max2, j_max3
    for i in xrange(X.shape[0]):
        for j in xrange(X.shape[1]):
            if X[i,j] > curr_max1:
                curr_max1 = X[i,j]
                i_max1 = i
                j_max1 = j
            elif max_no_galaxies > 1 and X[i,j] > curr_max2 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2):
                curr_max2 = X[i,j]
                i_max2 = i
                j_max2 = j
            elif max_no_galaxies > 2 and X[i,j] > curr_max3 and (((i_max1 - i)**2 + (j_max1 - j)**2) > r1**2) and (((i_max2 - i)**2 + (j_max2 - j)**2) > r2**2):
                curr_max3 = X[i,j]
                i_max3 = i
                j_max3 = j
    return [(i_max1, j_max1, curr_max1), (i_max2, j_max2, curr_max2), (i_max3, j_max3, curr_max3)][:max_no_galaxies]

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_strength_(np.ndarray[dtype_t, ndim=2] M, dtype_t r=5000):
    # for each of the 300 skies, evaluate the mean signal intensity on a
    # 42x42 grid (every 100 pixels) using only galaxies within radius r
    cdef:
        np.ndarray[dtype_t, ndim=3] s = np.zeros((300, 42, 42), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] w
        Py_ssize_t x0, y0, skyid
    for skyid in xrange(300):
        X = M[M[:,0] == skyid]
        for x0 in xrange(0, 4200, 100):
            for y0 in xrange(0, 4200, 100):
                w = X[check_distance(X[:,1], X[:,2], x0, y0, r)]
                if w.shape[0] > 0:
                    s[skyid, int(x0/100.0), int(y0/100.0)] = calc_signal_intensity(w[:,1], w[:,2], w[:,3], w[:,4], x0, y0)
    return s.reshape((300, -1))  # (# of samples, # of features)
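# Quick sanity check on the sign convention (hypothetical numbers, not taken
# from the competition data): for a galaxy on the x-axis relative to the halo,
# phi = 0 and the signal reduces to -e1, so a tangentially aligned galaxy
# (major axis along y, i.e. e1 < 0) produces a positive signal.
gx = np.array([100.0]); gy = np.array([0.0])
ge1 = np.array([-0.3]); ge2 = np.array([0.0])
print calc_signal_point(gx, gy, ge1, ge2, 0.0, 0.0)   # -> 0.3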
%%cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt
cimport cython

ctypedef np.float64_t dtype_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef generate_halos(np.int number_of_halos=1):
    # draw a random halo configuration; rows are (x, y, r0, m, std)
    cdef:
        np.ndarray[dtype_t, ndim=2] halos = np.zeros((3, 5), dtype=np.float64)
    for i in xrange(number_of_halos):
        halos[i, 0] = np.random.random_sample() * 4200.0   # x
        halos[i, 1] = np.random.random_sample() * 4200.0   # y
        halos[i, 2] = np.random.random_sample() * 1000.0   # r0
        halos[i, 3] = np.random.random_sample() * 200.0    # m
        halos[i, 4] = np.random.random_sample() * 0.0      # std (currently pinned to zero)
    return halos

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef init(np.ndarray[dtype_t, ndim=2] data):
    # random-restart initialization: draw 5000 random halo configurations per
    # sky and keep the one with the highest log-likelihood
    cdef:
        np.int trial, no_halos
        np.ndarray[dtype_t, ndim=3] best_halos = np.zeros((300, 3, 5), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] log_likelihoods = np.zeros((300, 2), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] X, halos
        dtype_t log_likelihood
        Py_ssize_t skyid
    for skyid in xrange(int(data[:,0].min()), int(data[:,0].max()) + 1):
        X = data[data[:,0] == skyid]   # fancy indexing -> calls copy constructor :(
        log_likelihoods[skyid, 0] = -np.float('inf')   # initialize log_likelihood to negative infinity
        log_likelihoods[skyid, 1] = 0                  # initialize # of acceptances
        no_halos = np.int(X[0, 5])     # halo count is constant within a sky
        for trial in xrange(5000):
            halos = generate_halos(no_halos)
            log_likelihood = calc_log_likelihood(X, halos)
            if log_likelihood > log_likelihoods[skyid, 0]:
                log_likelihoods[skyid, 1] += 1
                best_halos[skyid] = halos
                log_likelihoods[skyid, 0] = log_likelihood
        #print 'finished initializing sky#%s -> %s accepted with loglikelihood=%s' % (str(skyid+1), str(log_likelihoods[skyid, 1]), str(log_likelihoods[skyid, 0]))
    return best_halos, log_likelihoods

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef calc_log_likelihood(np.ndarray[dtype_t, ndim=2] X, np.ndarray[dtype_t, ndim=2] halos):
    cdef:
        dtype_t halo_predicted_x, halo_predicted_y, halo_predicted_r_cutoff, halo_predicted_mass, halo_predicted_std, e_std
        np.ndarray[dtype_t, ndim=1] predicted_var, e1_force, e2_force, phi, dist_halo1, r_effective, force
        Py_ssize_t i, j, k, N
        np.int number_halos = np.int(X[0, 5])
    e_std = 0.22
    e1_force = np.zeros_like(X[:,1])
    e2_force = np.zeros_like(X[:,1])
    # model parameter constraints -> shamelessly stolen from Iain Murray
    if (np.any(halos[:,0] < 0) or np.any(halos[:,0] > 4200) or
        np.any(halos[:,1] < 0) or np.any(halos[:,1] > 4200) or
        np.any(halos[:,2] < 0) or np.any(halos[:,2] > 1000) or
        np.any(halos[:,3] < 0) or np.any(halos[:,3] > 300) or
        np.any(halos[:,4] < 0) or np.any(halos[:,4] > 0.3)):
        return -np.float('inf')
    for i in xrange(number_halos):
        halo = halos[i,:]
        halo_predicted_x = halo[0]
        halo_predicted_y = halo[1]
        halo_predicted_r_cutoff = halo[2]
        halo_predicted_mass = halo[3]
        halo_predicted_std = e_std + halo[4]
        # note: predicted_var is reset for each halo, so only the last halo's
        # scatter term survives
        predicted_var = (e_std * e_std) * np.ones_like(X[:,1])
        # arctan (rather than arctan2) is fine here: phi only enters through
        # cos(2*phi) and sin(2*phi), which are invariant under phi -> phi + pi
        phi = np.arctan((X[:,2] - halo_predicted_y) / (X[:,1] - halo_predicted_x))
        dist_halo1 = ((X[:,1] - halo_predicted_x)**2 + (X[:,2] - halo_predicted_y)**2)**(0.5)
        r_effective = np.maximum(halo_predicted_r_cutoff, dist_halo1)
        force = halo_predicted_mass / (r_effective * 1.0)
        e1_force += -force * np.cos(2*phi)
        e2_force += -force * np.sin(2*phi)
        # galaxies inside the cutoff radius get the (larger) per-halo variance
        predicted_var[dist_halo1 < halo_predicted_r_cutoff] = np.maximum(
            predicted_var[dist_halo1 < halo_predicted_r_cutoff],
            halo_predicted_std * halo_predicted_std)
    log_likelihood = np.sum(-0.5 * (((X[:,3] - e1_force)**2 + (X[:,4] - e2_force)**2) / predicted_var)
                            - np.log(2 * np.pi * predicted_var))
    return log_likelihood

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef mcmc(np.ndarray[dtype_t, ndim=2] data, np.ndarray[dtype_t, ndim=3] best_halos,
           np.ndarray[dtype_t, ndim=2] log_likelihoods, np.int no_iterations, dtype_t stepsize):
    cdef:
        np.ndarray[dtype_t, ndim=2] proposed_halos = np.empty_like(best_halos[0])
        np.ndarray[dtype_t, ndim=4] samples = np.zeros((300, no_iterations, 3, 5), dtype=np.float64)
        np.ndarray[dtype_t, ndim=2] X
        np.ndarray[dtype_t, ndim=2] perturbation = np.zeros((3, 5), dtype=np.float64)
        dtype_t log_likelihood_proposed
        np.int number_halos, iteration
        Py_ssize_t skyid
    for skyid in xrange(int(data[:,0].min()), int(data[:,0].max()) + 1):
        X = data[data[:,0] == skyid]
        number_halos = np.int(X[0, 5])
        log_likelihoods[skyid, 1] = 0   # reset the acceptance counter
        for iteration in xrange(no_iterations):
            # @todo: stepsize should be different for each variable type!
            perturbation[:number_halos] = stepsize * np.random.randn(number_halos, 5)
            proposed_halos = perturbation + best_halos[skyid]
            log_likelihood_proposed = calc_log_likelihood(X, proposed_halos)
            # Metropolis accept/reject: accept with probability min(1, exp(delta))
            if np.log(np.random.random_sample()) < (log_likelihood_proposed - log_likelihoods[skyid, 0]):
                log_likelihoods[skyid, 1] += 1
                log_likelihoods[skyid, 0] = log_likelihood_proposed
                best_halos[skyid] = proposed_halos
            # record the current chain state (not the raw proposal), so rejected
            # iterations repeat the previous sample as a proper Metropolis chain should
            samples[skyid][iteration] = best_halos[skyid]
        #print 'finished processing skyid%s' % str(skyid+1)
        #print '\tacceptance ratio = %s / %s\tlog_likelihood = %s' % (str(log_likelihoods[skyid, 1]), str(no_iterations), str(log_likelihoods[skyid, 0]))
    return best_halos, samples
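# mcmc above is a plain random-walk Metropolis sampler: perturb the current
# halo parameters with Gaussian noise and accept whenever
# log(u) < loglik(proposal) - loglik(current), with u ~ Uniform(0, 1).
# A minimal 1-D sketch of the same accept rule, targeting a standard normal
# (illustration only, not the competition model):
def metropolis_1d(logp, x0, n, step):
    xs = np.empty(n)
    x, lp = x0, logp(x0)
    for i in xrange(n):
        proposal = x + step * np.random.randn()
        lp_proposal = logp(proposal)
        if np.log(np.random.random_sample()) < lp_proposal - lp:
            x, lp = proposal, lp_proposal   # accept the proposal
        xs[i] = x                           # rejected steps repeat the state
    return xs

chain = metropolis_1d(lambda z: -0.5 * z * z, 0.0, 20000, 1.0)
print chain[5000:].mean(), chain[5000:].std()   # ~0 and ~1 after burn-in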
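# get_max (used by draw_heatmap below) greedily picks up to three grid maxima,
# rejecting any candidate within r1 (or r2) cells of an already-chosen peak.
# A toy 42x42 grid with hypothetical values shows the exclusion logic:
toy = np.zeros((42, 42), dtype=np.float64)
toy[10, 10] = 5.0   # global maximum
toy[11, 11] = 4.0   # second-highest, but within r1=15 cells of the first peak
toy[35, 35] = 3.0   # far enough away to be accepted as the second peak
print get_max(toy, max_no_galaxies=2)   # -> [(10, 10, 5.0), (35, 35, 3.0)]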
@joblib_cache.cache
def load_data():
    train_halos = pd.read_csv(os.path.join(data_directory, 'Training_halos.csv'))
    test_halo_counts = pd.read_csv(os.path.join(data_directory, 'Test_haloCounts.csv'))
    train_skies = pd.DataFrame()
    for s in os.listdir(os.path.join(data_directory, 'Train_Skies')):
        try:
            d = pd.read_csv(os.path.join(data_directory, 'Train_Skies', s))
            d['SkyId'] = s.split("_")[1].split(".")[0]
            d['ID'] = d['SkyId'].map(lambda x: int(str(x).split('y')[1])) - 1
            # ellipse parameters for plotting, with total ellipticity e = sqrt(e1^2 + e2^2)
            d['a'] = 1/(1 - np.sqrt(d['e1']**2 + d['e2']**2))
            d['b'] = 1/(1 + np.sqrt(d['e1']**2 + d['e2']**2))
            d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1']) * 0.5)
            d['e1_norm'] = (d.e1 - d.e1.mean()) / d.e1.std()
            d['e2_norm'] = (d.e2 - d.e2.mean()) / d.e2.std()
            train_skies = train_skies.append(d)
        except Exception as e:
            print s, ' -> ', e.message
    test_skies = pd.DataFrame()
    for s in os.listdir(os.path.join(data_directory, 'Test_Skies')):
        try:
            d = pd.read_csv(os.path.join(data_directory, 'Test_Skies', s))
            d['SkyId'] = s.split("_")[1].split(".")[0]
            d['ID'] = d['SkyId'].map(lambda x: int(str(x).split('y')[1])) - 1
            d['a'] = 1/(1 - np.sqrt(d['e1']**2 + d['e2']**2))
            d['b'] = 1/(1 + np.sqrt(d['e1']**2 + d['e2']**2))
            d['theta'] = np.degrees(np.arctan2(d['e2'], d['e1']) * 0.5)
            d['e1_norm'] = (d.e1 - d.e1.mean()) / d.e1.std()
            d['e2_norm'] = (d.e2 - d.e2.mean()) / d.e2.std()
            test_skies = test_skies.append(d)
        except Exception as e:
            print s, ' -> ', e.message
    df = pd.merge(train_skies, train_halos, on='SkyId')
    df_test = pd.merge(test_skies, test_halo_counts, on='SkyId')
    return df, df_test, train_halos

@joblib_cache.cache
def calc_strength(X, r):
    return calc_strength_(X, r)

def draw_sky(X, train_halos, calculated_halos=None, size_multiplier=25, filename=None):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, aspect='equal')
    for row in xrange(X.index.min(), X.index.max()):
        try:
            x, y, a, b, theta = X.ix[row][['x', 'y', 'a', 'b', 'theta']].values
            galaxy = Ellipse(xy=(x, y), width=size_multiplier*a, height=size_multiplier*b, angle=theta, fc='none')
            ax.add_patch(galaxy)
        except:
            print row
    for i in range(1, 1 + int(train_halos['numberHalos'].values[0])):
        x, y = train_halos['halo_x'+str(i)].values, train_halos['halo_y'+str(i)].values
        print x, y
        ax.add_patch(Circle(xy=(x, y), fc='r', ec='none', alpha=0.5, radius=3*size_multiplier))
    ax.autoscale_view(tight=True)
    plt.show()
    if filename is not None:
        fig.savefig(filename + '.pdf', format='pdf')  #, bbox_inches='tight')
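# draw_sky renders each galaxy as an ellipse built from the two ellipticity
# components computed in load_data: with e = sqrt(e1^2 + e2^2), a = 1/(1-e),
# b = 1/(1+e), and position angle theta = 0.5*arctan2(e2, e1).
# A worked example with hypothetical values:
e1_demo, e2_demo = 0.2, 0.1
e_demo = np.sqrt(e1_demo**2 + e2_demo**2)                    # total ellipticity ~0.2236
a_demo = 1.0 / (1.0 - e_demo)                                # semi-major axis ~1.288
b_demo = 1.0 / (1.0 + e_demo)                                # semi-minor axis ~0.817
theta_demo = np.degrees(0.5 * np.arctan2(e2_demo, e1_demo))  # position angle ~13.3 degrees
print a_demo, b_demo, theta_demo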
# the heatmap below shows the strength of the signal intensity
# due to the presence of a halo
def draw_heatmap(ID, df, train_halos, r=2000, filename=None):
    skyid = 'Sky' + str(ID + 1)
    X = df[['ID', 'x', 'y', 'e1', 'e2', 'a', 'b', 'theta']].values
    strength = calc_strength(X, r=r)
    #print strength[np.isnan(strength)].shape
    s = strength.reshape((300, 42, 42))
    fig = plt.figure(figsize=(14, 14))
    imgplot = imshow(asarray(s[ID].T), origin='lower')
    number_of_halos = train_halos[train_halos.SkyId == skyid]['numberHalos'].values[0]
    imgplot.set_cmap('spectral')
    plt.colorbar()
    colors = itertools.cycle('rgbcmykw')
    # circles: peaks picked from the signal grid
    for color, halo in zip(colors, get_max(s[ID], max_no_galaxies=number_of_halos)):
        print halo
        scatter(halo[0], halo[1], marker='o', s=500, c=color, linewidths=2)
    # crosses: true halo positions (converted from pixels to grid cells)
    th = train_halos[train_halos.SkyId == skyid]
    for i in range(1, 1 + int(th['numberHalos'].values[0])):
        x, y = th['halo_x'+str(i)].values, th['halo_y'+str(i)].values
        scatter(x/100, y/100, marker='x', s=500, c='Y', linewidths=2)
        print x[0]/100, ',', y[0]/100, ',', s[ID][int(x/100)][int(y/100)]
        #ax.add_patch(Circle(xy=(x, y), fc='y', ec='none', alpha=0.5, radius=2*size_multiplier))
        #ax.add_patch(Circle(xy=(100*halo[0], 100*halo[1]), fc=color, ec='none', alpha=0.5, radius=500))
    #ax.autoscale_view(tight=True)
    if filename is not None:
        fig.savefig(filename + '.pdf', format='pdf')  #, bbox_inches='tight')

def test_calc_log_likelihood(df_test):
    tol = 10**(-5)
    cols = ['ID', 'x', 'y', 'e1', 'e2', 'NumberHalos']
    halo1 = np.array([[4200, 4200, 1000, 200, 0]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo1) - 107.65312710886943) <= tol
    halo2 = np.array([[2400, 2400, 500, 100, 0]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo2) - 116.10013304100042) <= tol
    halo3 = np.array([[2400, 2400, 500, 100, 0.01]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo3) - 116.53971627108918) <= tol
    halo4 = np.array([[2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 0][cols].values, halo4) - 116.15631624187554) <= tol
    halo5 = np.array([[2400, 2400, 500, 100, 0.001], [2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 40][cols].values, halo5) - -47.3726778566) <= tol
    halo6 = np.array([[2400, 2400, 500, 100, 0.001], [2400, 2400, 500, 100, 0.001], [2400, 2400, 500, 100, 0.001]], dtype=np.float64)
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 99][cols].values, halo6) - -218.106874888) <= tol
    halo7 = np.array([[2820.241455786800543, 2530.896772986271117, 95.463841074174709, 5.031270423604361, 0.],
                      [1239.840315636071637, 2429.487190770937559, 980.917139013906194, 52.64664806466692, 0.],
                      [108.199502973083582, 140.00882477450304, 610.810091887177578, 107.024441568824273, 0.]])
    assert np.abs(calc_log_likelihood(df_test[df_test.ID == 99][cols].values, halo7) - 91.1307046217) <= tol

@joblib_cache.cache
def run_init(X):
    #X = df_test[['ID', 'x', 'y', 'e1', 'e2', 'NumberHalos']]
    #X = df[['ID', 'x', 'y', 'e1', 'e2', 'numberHalos']]
    best_halos, log_likelihoods = init(X.values)
    return best_halos, log_likelihoods

@joblib_cache.cache
def run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019):
    predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
    return predictions, samples
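# run_init and run_mcmc are cached through joblib's Memory.cache, so re-running
# the notebook reuses on-disk results whenever the arguments hash identically.
# A minimal sketch of the mechanism (toy function, separate cache directory):
demo_cache = Memory(cachedir=os.path.join(output_directory, 'joblib_demo'), verbose=0)

@demo_cache.cache
def slow_double(x):
    print 'cache miss, computing...'
    return 2 * x

print slow_double(21)   # computes and stores the result
print slow_double(21)   # served from the on-disk cache; no 'cache miss' printed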
def test(ID, no_iterations=20000, step_size=0.019):
    X = df[df.ID == ID][['ID', 'x', 'y', 'e1', 'e2', 'numberHalos']]
    best_halos, log_likelihoods = init(X.values)
    predictions, samples = mcmc(X.values, best_halos, log_likelihoods, no_iterations, step_size)
    return predictions, samples

df, df_test, train_halos = load_data()
test_calc_log_likelihood(df_test)

# "training set"
df.head()

# "test set"
df_test.head()

# x and y are both uniformly distributed, with mean ~2100 and std ~1210
# (consistent with Uniform(0, 4200)); see the distribution plots below
no_halos = 1
print df[df.numberHalos == no_halos].x.mean()
print df[df.numberHalos == no_halos].x.std()
print df[df.numberHalos == no_halos].y.mean()
print df[df.numberHalos == no_halos].y.std()

plt.figure(figsize=(7, 7))
df['x'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('x_distribution.pdf', format='pdf')

plt.figure(figsize=(7, 7))
df['y'].hist(color='k', alpha=0.5, bins=50)
plt.savefig('y_distribution.pdf', format='pdf')

plt.figure(figsize=(10, 10))
plt.scatter(df['x'], df['y'], s=1, cmap=mpl.cm.gray)
plt.savefig('x_vs_y_distribution.pdf', format='pdf')

# e1 and e2 are both N(0, 0.22**2), independent of the number of halos;
# see the distribution plots below
no_halos = 1
print df[df.numberHalos == no_halos].e1.mean()
print df[df.numberHalos == no_halos].e1.std()
print df[df.numberHalos == no_halos].e2.mean()
print df[df.numberHalos == no_halos].e2.std()

plt.figure(figsize=(7, 7))
df['e1_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e1_norm_distribution.pdf', format='pdf')

plt.figure(figsize=(7, 7))
df['e2_norm'].hist(color='k', alpha=0.5, bins=100)
plt.savefig('e2_norm_distribution.pdf', format='pdf')

plt.figure(figsize=(7, 7))
plt.scatter(df['e1'], df['e2'], s=1, cmap=mpl.cm.gray)
plt.savefig('e1_vs_e2_norm_distribution.pdf', format='pdf')
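# The ~0.22 scatter measured above is the e_std baseline hard-coded in
# calc_log_likelihood. As an extra (illustrative) check, D'Agostino's
# normality test on the normalized components:
from scipy import stats
print stats.normaltest(df['e1_norm'].values)   # (statistic, p-value)
print stats.normaltest(df['e2_norm'].values)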
# 1 halo case
skyid = 'Sky99'
draw_sky(df[df['SkyId'] == skyid], train_halos[train_halos['SkyId'] == skyid])

# 2 halo case - check out those galaxies!
skyid = 'Sky116'
draw_sky(df[df['SkyId'] == skyid], train_halos[train_halos['SkyId'] == skyid], filename='Sky116')

# 3 halo case
skyid = 'Sky300'
draw_sky(df[df['SkyId'] == skyid], train_halos[train_halos['SkyId'] == skyid], filename='sky300')

draw_heatmap(99, df, train_halos, r=4000, filename='heatmap_99')
draw_heatmap(115, df, train_halos, r=4000, filename='heatmap116')

X = df[['ID', 'x', 'y', 'e1', 'e2', 'numberHalos']]
best_halos, log_likelihoods = run_init(X)
predictions, samples = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.019)
predictions2, samples2 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=0.19)
predictions3, samples3 = run_mcmc(X, best_halos, log_likelihoods, no_iterations=50000, step_size=1.9)

# scatter of the x/y samples for sky 1 after discarding burn-in
plt.scatter(samples[0][30000:].T[0], samples[0][30000:].T[1])

burnin = 40000
results = []
for SkyID in xrange(300):
    t = {}
    t['SkyId'] = 'Sky' + str(SkyID + 1)
    for halo in [0, 1, 2]:
        # posterior mean of each halo's x and y after burn-in
        t['halo_x'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[0][halo].mean()
        t['halo_y'+str(halo+1)+'_prediction'] = samples[SkyID][burnin:].T[1][halo].mean()
    results.append(t)

predictions_df = pd.DataFrame(results)
predictions_df = predictions_df[['SkyId', 'halo_x1_prediction', 'halo_y1_prediction',
                                 'halo_x2_prediction', 'halo_y2_prediction',
                                 'halo_x3_prediction', 'halo_y3_prediction']]

final_results = pd.merge(train_halos, predictions_df, on='SkyId')

final_results_1_halo = final_results[final_results.numberHalos == 1]
final_results_1_halo['x1_diff'] = final_results_1_halo['halo_x1'] - final_results_1_halo['halo_x1_prediction']
final_results_1_halo['y1_diff'] = final_results_1_halo['halo_y1'] - final_results_1_halo['halo_y1_prediction']

print final_results_1_halo['x1_diff'].mean()
print final_results_1_halo['x1_diff'].std()
final_results_1_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_1halo_error.pdf', format='pdf')

print final_results_1_halo['y1_diff'].mean()
print final_results_1_halo['y1_diff'].std()
final_results_1_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_1halo_error.pdf', format='pdf')

# we don't know the proper halo ordering, so assume each true halo is matched
# by whichever prediction gives the smaller signed difference, :p
# (note: np.minimum favors the more negative value, not the smaller error;
# see the absolute-distance sketch at the end)
final_results_2_halo = final_results[final_results.numberHalos == 2]
final_results_2_halo['x1_diff'] = np.minimum(final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x1_prediction'],
                                             final_results_2_halo['halo_x1'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y1_diff'] = np.minimum(final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y1_prediction'],
                                             final_results_2_halo['halo_y1'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo['x2_diff'] = np.minimum(final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x1_prediction'],
                                             final_results_2_halo['halo_x2'] - final_results_2_halo['halo_x2_prediction'])
final_results_2_halo['y2_diff'] = np.minimum(final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y1_prediction'],
                                             final_results_2_halo['halo_y2'] - final_results_2_halo['halo_y2_prediction'])
final_results_2_halo.head()

print final_results_2_halo['x1_diff'].mean()
print final_results_2_halo['x1_diff'].std()
final_results_2_halo['x1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x1_2halo_error.pdf', format='pdf')

print final_results_2_halo['y1_diff'].mean()
print final_results_2_halo['y1_diff'].std()
final_results_2_halo['y1_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y1_2halo_error.pdf', format='pdf')
print final_results_2_halo['x2_diff'].mean()
print final_results_2_halo['x2_diff'].std()
final_results_2_halo['x2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('x2_2halo_error.pdf', format='pdf')

print final_results_2_halo['y2_diff'].mean()
print final_results_2_halo['y2_diff'].std()
final_results_2_halo['y2_diff'].hist(color='k', alpha=0.5, bins=50);
plt.savefig('y2_2halo_error.pdf', format='pdf')

Clearly these solutions still need to be optimized: proper halo matching and per-parameter step sizes are the obvious next steps.
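# One possible cleanup, sketched (not the method used above): match each true
# halo to whichever prediction is closer in absolute distance, instead of the
# signed np.minimum, which favors the more negative difference.
def nearest_diff(truth, pred_a, pred_b):
    d_a = truth - pred_a
    d_b = truth - pred_b
    return np.where(np.abs(d_a) <= np.abs(d_b), d_a, d_b)

x1_diff = nearest_diff(final_results_2_halo['halo_x1'],
                       final_results_2_halo['halo_x1_prediction'],
                       final_results_2_halo['halo_x2_prediction'])
print np.abs(x1_diff).mean()
# a full treatment would match on 2-D distance, so x and y pick the same
# predicted halo (e.g. a Hungarian-algorithm assignment)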