In [1]:
import numpy as np
import math as m
import matplotlib.pyplot as plt
%matplotlib inline

In [111]:
# set the box range
box_dim = 100

# create random pairs of coordinates in the box (x1, y1, x2, x2)
randoms = np.random.rand(10000, 4)*box_dim
randoms[0:10,]

Out[111]:
array([[ 10.67703215,  87.27675791,  11.56526361,  79.92198867],
[ 81.2283512 ,  52.24879191,  75.5042816 ,  55.0166546 ],
[ 32.70622064,  67.98464488,  27.73900492,  95.52083663],
[ 66.46936307,  77.90811741,  16.87401609,  24.82937002],
[ 11.60415264,  99.9630587 ,   4.45027578,  36.35778503],
[ 15.53375625,  17.49650387,   4.81442386,  38.56435318],
[ 33.14291848,  41.30859325,  28.09719459,  11.53435108],
[ 45.74869111,  58.96177641,  67.22882704,  57.9946143 ],
[ 65.06045955,  24.43825205,  60.75214046,  72.82288794],
[ 49.19464056,  74.05560444,  10.22045352,  11.78019158]])
In [116]:
# calculute the true line lengths - vectorised pythag
line_lengths = np.sqrt(np.square(randoms[:,2]-randoms[:,0])+np.square(randoms[:,3]-randoms[:,1]))

#plot the true line length distribution
plt.hist(line_lengths, bins = np.arange(1, 150, 1))
plt.show()

In [160]:
# make a new copy of the coordinates
randoms_truncated = randoms.copy()

x_truncation = 1.0
y_truncation = 11.0

# truncate the x's to nearest 1m, and the y's to the nearest 11m
randoms_truncated[:,0] = np.around(randoms[:,0]/x_truncation,0)*x_truncation
randoms_truncated[:,2] = np.around(randoms[:,2]/x_truncation,0)*x_truncation
randoms_truncated[:,1] = np.around(randoms[:,1]/y_truncation,0)*y_truncation
randoms_truncated[:,3] = np.around(randoms[:,3]/y_truncation,0)*y_truncation

In [161]:
# recalc the new line lengths
truncated_line_lengths = np.sqrt(np.square(randoms_truncated[:,2]-randoms_truncated[:,0])+np.square(randoms_truncated[:,3]-randoms_truncated[:,1]))

#plot as before
plt.hist(truncated_line_lengths, bins = np.arange(1, 150, 1))
plt.show()

In [220]:
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(20,10))
ax1.set_title('Original')
ax2.set_title('Truncated')

ax1.hist(line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
ax2.hist(truncated_line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
plt.show()

In [147]:
residuals = randoms - randoms_truncated

In [157]:
# take a look at the residuals which we cut off when we truncated. These are flat distributions...

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=True, figsize=(20,10))
ax1.set_title('Residual Y1')
ax2.set_title('Residual Y2')
ax3.set_title('Residual X1')
ax4.set_title('Residual X2')

ax1.hist(residuals[:,1], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax2.hist(residuals[:,3], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax3.hist(residuals[:,0], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax4.hist(residuals[:,2], bins = np.arange(-6, 6, 0.1), alpha=0.5)

plt.show()

In [168]:
# ok, so can we fix the broken distribution

num_rows = len(randoms[:,1])

simulated_residuals = np.random.rand(10000, 4)

simulated_residuals[:,0] = (simulated_residuals[:,0] * x_truncation) - (x_truncation/2)
simulated_residuals[:,2] = (simulated_residuals[:,2] * x_truncation) - (x_truncation/2)
simulated_residuals[:,1] = (simulated_residuals[:,1] * y_truncation) - (y_truncation/2)
simulated_residuals[:,3] = (simulated_residuals[:,3] * y_truncation) - (y_truncation/2)

In [169]:
# take a look at our simulated residuals and check they look like the real resilduals

f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=True, figsize=(20,10))
ax1.set_title('Random Residual Y1')
ax2.set_title('Random Residual Y2')
ax3.set_title('Random Residual X1')
ax4.set_title('Random Residual X2')

ax1.hist(simulated_residuals[:,1], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax2.hist(simulated_residuals[:,3], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax3.hist(simulated_residuals[:,0], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax4.hist(simulated_residuals[:,2], bins = np.arange(-6, 6, 0.1), alpha=0.5)

plt.show()

In [174]:
# add the simulated residuals and take another look at the lengths histogram!

fixed_data = randoms_truncated + simulated_residuals

# recalc the new line lengths
fixed_line_lengths = np.sqrt(np.square(fixed_data[:,2]-fixed_data[:,0])+np.square(fixed_data[:,3]-fixed_data[:,1]))

#plot as before
plt.hist(fixed_line_lengths, bins = np.arange(1, 150, 1))
plt.show()

In [218]:
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20,10))
ax1.set_title('Original')
ax2.set_title('Truncated')
ax3.set_title('Fixed')

ax1.hist(line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
ax2.hist(truncated_line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
ax3.hist(fixed_line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
plt.show()

In [230]:
from scipy.optimize import curve_fit

In [231]:
# Define model function to be used to fit to the data above:
def gauss(x, *p):
A, mu, sigma = p
return A*np.exp(-(x-mu)**2/(2.*sigma**2))

# p0 is the initial guess for the fitting coefficients (A, mu and sigma above)
p0 = [1., 50., 25.]

In [232]:
bins = np.arange(0, 150, 5)
bin_centres = (bins[:-1] + bins[1:])/2

In [233]:
# create histograms for fitting and fit curves
hist_original, bin_edges = np.histogram(line_lengths, bins=bins, density=True)
hist_truncated, bin_edges = np.histogram(truncated_line_lengths, bins=bins, density=True)
hist_fixed, bin_edges = np.histogram(fixed_line_lengths, bins=bins, density=True)

coeff_original, var_matrix = curve_fit(gauss, bin_centres, hist_original, p0=p0)
coeff_truncated, var_matrix = curve_fit(gauss, bin_centres, hist_truncated, p0=p0)
coeff_fixed, var_matrix = curve_fit(gauss, bin_centres, hist_fixed, p0=p0)

In [234]:
# Create histograms of fitted curves
hist_original_fit = gauss(bin_centres, *coeff_original)
hist_truncated_fit = gauss(bin_centres, *coeff_truncated)
hist_fixed_fit = gauss(bin_centres, *coeff_fixed)

In [235]:
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20,5))
ax1.set_title('Original')
ax2.set_title('Truncated')
ax3.set_title('Fixed')

ax1.plot(bin_centres, hist_original, color='r',label='Test data')
ax1.plot(bin_centres, hist_original_fit, color='b', label='Fitted data')

ax2.plot(bin_centres, hist_truncated, color='r',label='Test data')
ax2.plot(bin_centres, hist_truncated_fit, color='b', label='Fitted data')

ax3.plot(bin_centres, hist_fixed, color='r',label='Test data')
ax3.plot(bin_centres, hist_fixed_fit, color='b', label='Fitted data')

plt.show()

print 'Original - Mean: {}, SD: {}'.format(coeff_original[1], coeff_original[2])
print 'Truncated - Mean: {}, SD: {}'.format(coeff_truncated[1], coeff_truncated[2])
print 'Fixed - Mean: {}, SD: {}'.format(coeff_fixed[1], coeff_fixed[2])

Original - Mean: 48.9664426296, SD: 29.1530505025
Truncated - Mean: 49.3492839645, SD: 29.4436943482
Fixed - Mean: 49.2043883025, SD: 29.328475245