%pylab inline
from __future__ import division
import utils
import seaborn
seaborn.set()
colors = seaborn.color_palette("deep")

# We will simulate a dataset with two variables.
# First we create a variable x
x = arange(15)

a = 2  # Scaling: scales y in relation to the same value of x
b = 4  # Additive term: shifts y up in relation to the same value of x

# We will create a second variable as a linear transformation of the first
# one: we double x, add 4 to each of its elements, and add random Gaussian
# noise with as many entries as x
noise = randn(len(x))

# Note that we are effectively using the formula y = ax + b to simulate the data
y = (a * x + b) + 3 * noise

# Now we take a look at the relation between these two variables with a scatterplot
plot(x, y, "o")
xlabel("variable 1")
ylabel("variable 2");

# To help visualize the relation between the two variables it is good to
# plot them on axes with equal spacing (on x and y) that display equal
# ranges of x and y values
figure(figsize=(8, 8))
plot(x, y, "o")
axis((-1, 35, -1, 35))

# Above we simulated the data using the relation y = ax + b.
# Here we show that we can rewrite the same relation using matrix notation
X = row_stack((x, ones_like(x)))

# Notice that the way we have set up our matrix X, each regressor is in a
# row and each observation is in a column. The matrix X should be
# transposed so that regressors are in columns and observations are in
# rows. This orientation is essential for the matrix multiplication to
# work out properly
X = X.T  # Here we transpose X

# We could have also done
X = column_stack((x, ones_like(x)))

# The matrix X can be multiplied by a vector of weights (w). These weights
# effectively shift and scale the data in x; they are the coefficients of
# the linear equation, y = ax + b, that we used to simulate the data.
w = row_stack((a, b))

# We can describe the simulated data by plotting the straight line defined
# by the model y = ax + b. This is the model we used to transform x into y
y_model1 = a * x + b

# Plot the model on top of the data
plot(x, y_model1, linewidth=3)

# Alternatively we can describe the data using the same linear model
# written in matrix notation.
# Note that unlike MATLAB, using the "*" operator on numpy arrays performs
# element-wise multiplication. To perform real matrix multiplication, we
# could use numpy.matrix objects, which can be created with syntax similar
# to MATLAB
mtx1 = matrix("1 2; 3 4")
mtx2 = matrix(X)

# However, it is generally better practice to perform matrix
# multiplication using the numpy.dot() function.

# For those of you who are not familiar with linear algebra: our vector of
# weights, w, must be (1) a column vector and (2) have as many entries as
# there are columns of X. This can be formalized in the following line of
# code, where we assert that the number of columns in X is equal to the
# number of rows in w.
assert X.shape[1] == w.shape[0], "Dimension mismatch between X and w"

# Because the dimensions match we can perform the following multiplication
y_model2 = dot(X, w)

# Plot the model.
# The two models, 1 and 2, are perfectly on top of each other. We play
# with the line style to make them both visible: you should see a
# continuous green line (model 1) and on top of it a red dashed line
# (model 2).
plot(x, y_model2, "--", linewidth=3);
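# As a quick sanity check, we can confirm that the two formulations agree:
# multiplying X by w reproduces a * x + b for every observation. A small
# example also makes the difference between "*" and dot() concrete.
assert allclose(dot(X, w).ravel(), a * x + b)

A = array([[1, 2], [3, 4]])
B = array([[5, 6], [7, 8]])
print A * B      # element-wise: [[ 5 12] [21 32]]
print dot(A, B)  # matrix product: [[19 22] [43 50]]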
# Next we simulate a dataset that includes a non-linear component.
# First we create a variable, x
x = arange(-15, 16)

# We will now need three parameters
a = 1.5  # Scaling: expands y in relation to the same value of x
b = 2    # Additive term: shifts y up in relation to the same value of x
c = 1.5  # This term describes the influence of the non-linear component;
         # the greater this term, the stronger the nonlinearity

noise = randn(len(x)) * 6
y = c * x ** 2 + a * x + b + noise

plot(x, y, ".")
xlabel("variable 1")
ylabel("variable 2")
axis((-40, 40, -10, 400));

X = column_stack((x ** 2, x, ones_like(x)))

# Plot the matrix
matshow(X)
# We will turn off the grid we have been using
grid()
# Set the colormap for the whole image to a scale of grays
gray()
# Turn on a colorbar to compare the values in the image
colorbar()

# Make the weight vector
w = row_stack((c, a, b))

# Check that the dimensions of w and X are correct
sw = w.shape
print "w is %d by %d" % sw
sX = X.shape
print "X is %d by %d" % sX

# Formally check to make sure the dimensions match
assert sw[0] == sX[1], "Dimension mismatch between X and w"

# Build the model with matrix multiplication
y_model3 = dot(X, w)

# Plot the data and the model
plot(x, y, ".")
axis((-40, 40, -10, 400))
plot(x, y_model3);

# Suppose we collected some measure of height, weight, and the
# corresponding "beauty" for a set of individuals.
height = array([1.75, 1.55, 1.82, 1.77, 1.8, 1.65, 1.6])  # in meters
weight = array([75, 70, 72, 65, 68, 70, 68])  # in kg
beauty = array([1, 4, 2, 7, 7, 4, 3])  # 1-7, 7 is most beautiful

# Unlike the MATLAB code, we can do this in one step
scatter(height, weight, beauty * 20, color=colors[2])
axis((1.5, 2, 60, 80))
xlabel("Height (meters)")
ylabel("Weight (kg)");

# Now consider a new individual whose beauty we want to predict
height_ind = 1.9
weight_ind = 67
distance = (height_ind - height) ** 2 + (weight_ind - weight) ** 2

# We can also do this with a function from the scipy package
from scipy.spatial.distance import cdist

pop = column_stack((height, weight))
sample = column_stack((height_ind, weight_ind))
distance2 = cdist(pop, sample)

# By default, cdist computes the Euclidean distance, but many other
# metrics are available. It also generalizes to n dimensions.
# Note that distance holds *squared* distances, so we take the square root
# before comparing; we use allclose() rather than == because the two
# computations can differ by floating-point rounding
assert allclose(sqrt(distance), distance2.ravel())

scatter(height, weight, beauty * 20, color=colors[2])
axis((1.5, 2, 60, 80))
xlabel("Height (meters)")
ylabel("Weight (kg)")

# Find the closest individual
idx = argmin(distance)
# Look up their beauty
predicted_beauty = beauty[idx]

# The nearest-neighbor model uses the closest data point to predict the
# expected value of a new data point. In this sense the model is
# non-parametric: it uses the actual data (the point closest to the new
# point) as the model (prediction).
# Here we plot the new individual with the original group.
scatter(height_ind, weight_ind, predicted_beauty * 20, color=colors[0]);
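# The nearest-neighbor lookup above is easy to package into a small
# reusable function. This is just a sketch (the name predict_nearest is
# ours, an illustrative helper); because cdist accepts points with any
# number of columns, it works for n predictors, not just two.
def predict_nearest(X_pop, y_pop, x_new):
    """Predict y for x_new as the y of the closest row of X_pop."""
    d = cdist(X_pop, atleast_2d(x_new))
    return y_pop[argmin(d)]

# For example, predicting the beauty of our new individual:
print predict_nearest(pop, beauty, [height_ind, weight_ind])

# cdist also supports other metrics, e.g. the city-block (L1) distance:
print cdist(pop, sample, metric="cityblock")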