import numpy as np
import math as m
import matplotlib.pyplot as plt
%matplotlib inline
# set the box range
box_dim = 100
# create random pairs of coordinates in the box (x1, y1, x2, x2)
randoms = np.random.rand(10000, 4)*box_dim
randoms[0:10,]
# calculute the true line lengths - vectorised pythag
line_lengths = np.sqrt(np.square(randoms[:,2]-randoms[:,0])+np.square(randoms[:,3]-randoms[:,1]))
#plot the true line length distribution
plt.hist(line_lengths, bins = np.arange(1, 150, 1))
plt.show()
# make a new copy of the coordinates
randoms_truncated = randoms.copy()
x_truncation = 1.0
y_truncation = 11.0
# truncate the x's to nearest 1m, and the y's to the nearest 11m
randoms_truncated[:,0] = np.around(randoms[:,0]/x_truncation,0)*x_truncation
randoms_truncated[:,2] = np.around(randoms[:,2]/x_truncation,0)*x_truncation
randoms_truncated[:,1] = np.around(randoms[:,1]/y_truncation,0)*y_truncation
randoms_truncated[:,3] = np.around(randoms[:,3]/y_truncation,0)*y_truncation
# recalc the new line lengths
truncated_line_lengths = np.sqrt(np.square(randoms_truncated[:,2]-randoms_truncated[:,0])+np.square(randoms_truncated[:,3]-randoms_truncated[:,1]))
#plot as before
plt.hist(truncated_line_lengths, bins = np.arange(1, 150, 1))
plt.show()
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(20,10))
ax1.set_title('Original')
ax2.set_title('Truncated')
ax1.hist(line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
ax2.hist(truncated_line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
plt.show()
residuals = randoms - randoms_truncated
# take a look at the residuals which we cut off when we truncated. These are flat distributions...
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=True, figsize=(20,10))
ax1.set_title('Residual Y1')
ax2.set_title('Residual Y2')
ax3.set_title('Residual X1')
ax4.set_title('Residual X2')
ax1.hist(residuals[:,1], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax2.hist(residuals[:,3], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax3.hist(residuals[:,0], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax4.hist(residuals[:,2], bins = np.arange(-6, 6, 0.1), alpha=0.5)
plt.show()
# ok, so can we fix the broken distribution
num_rows = len(randoms[:,1])
simulated_residuals = np.random.rand(10000, 4)
simulated_residuals[:,0] = (simulated_residuals[:,0] * x_truncation) - (x_truncation/2)
simulated_residuals[:,2] = (simulated_residuals[:,2] * x_truncation) - (x_truncation/2)
simulated_residuals[:,1] = (simulated_residuals[:,1] * y_truncation) - (y_truncation/2)
simulated_residuals[:,3] = (simulated_residuals[:,3] * y_truncation) - (y_truncation/2)
# take a look at our simulated residuals and check they look like the real resilduals
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=True, figsize=(20,10))
ax1.set_title('Random Residual Y1')
ax2.set_title('Random Residual Y2')
ax3.set_title('Random Residual X1')
ax4.set_title('Random Residual X2')
ax1.hist(simulated_residuals[:,1], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax2.hist(simulated_residuals[:,3], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax3.hist(simulated_residuals[:,0], bins = np.arange(-6, 6, 0.1), alpha=0.5)
ax4.hist(simulated_residuals[:,2], bins = np.arange(-6, 6, 0.1), alpha=0.5)
plt.show()
# add the simulated residuals and take another look at the lengths histogram!
fixed_data = randoms_truncated + simulated_residuals
# recalc the new line lengths
fixed_line_lengths = np.sqrt(np.square(fixed_data[:,2]-fixed_data[:,0])+np.square(fixed_data[:,3]-fixed_data[:,1]))
#plot as before
plt.hist(fixed_line_lengths, bins = np.arange(1, 150, 1))
plt.show()
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20,10))
ax1.set_title('Original')
ax2.set_title('Truncated')
ax3.set_title('Fixed')
ax1.hist(line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
ax2.hist(truncated_line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
ax3.hist(fixed_line_lengths, bins = np.arange(1, 150, 1), alpha=0.5)
plt.show()
from scipy.optimize import curve_fit
# Define model function to be used to fit to the data above:
def gauss(x, *p):
A, mu, sigma = p
return A*np.exp(-(x-mu)**2/(2.*sigma**2))
# p0 is the initial guess for the fitting coefficients (A, mu and sigma above)
p0 = [1., 50., 25.]
bins = np.arange(0, 150, 5)
bin_centres = (bins[:-1] + bins[1:])/2
# create histograms for fitting and fit curves
hist_original, bin_edges = np.histogram(line_lengths, bins=bins, density=True)
hist_truncated, bin_edges = np.histogram(truncated_line_lengths, bins=bins, density=True)
hist_fixed, bin_edges = np.histogram(fixed_line_lengths, bins=bins, density=True)
coeff_original, var_matrix = curve_fit(gauss, bin_centres, hist_original, p0=p0)
coeff_truncated, var_matrix = curve_fit(gauss, bin_centres, hist_truncated, p0=p0)
coeff_fixed, var_matrix = curve_fit(gauss, bin_centres, hist_fixed, p0=p0)
# Create histograms of fitted curves
hist_original_fit = gauss(bin_centres, *coeff_original)
hist_truncated_fit = gauss(bin_centres, *coeff_truncated)
hist_fixed_fit = gauss(bin_centres, *coeff_fixed)
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(20,5))
ax1.set_title('Original')
ax2.set_title('Truncated')
ax3.set_title('Fixed')
ax1.plot(bin_centres, hist_original, color='r',label='Test data')
ax1.plot(bin_centres, hist_original_fit, color='b', label='Fitted data')
ax2.plot(bin_centres, hist_truncated, color='r',label='Test data')
ax2.plot(bin_centres, hist_truncated_fit, color='b', label='Fitted data')
ax3.plot(bin_centres, hist_fixed, color='r',label='Test data')
ax3.plot(bin_centres, hist_fixed_fit, color='b', label='Fitted data')
plt.show()
print 'Original - Mean: {}, SD: {}'.format(coeff_original[1], coeff_original[2])
print 'Truncated - Mean: {}, SD: {}'.format(coeff_truncated[1], coeff_truncated[2])
print 'Fixed - Mean: {}, SD: {}'.format(coeff_fixed[1], coeff_fixed[2])