%pylab inline
from __future__ import division
import utils
import seaborn
seaborn.set()
colors = seaborn.color_palette("deep")

# We will simulate a dataset with two variables.
# First we create a variable x
x = arange(15)

a = 2  # Scaling: scales y in relation to the same value of x
b = 4  # Additive term: shifts y up in relation to the same value of x

# We will create a second variable as a linear transformation of the first
# one: we double x, add 4 to each of its elements, and add random Gaussian
# noise with as many entries as x
noise = randn(len(x))

# Note that we are effectively using the formula y = ax + b to simulate the data
y = (a * x + b) + 3 * noise

# Now we take a look at the relation between these two variables with a scatterplot
plot(x, y, "o")
xlabel("variable 1")
ylabel("variable 2");

# To help visualize the relation between the two variables it is good to
# plot them on axes with equal spacing (on x and y) that display equal
# ranges of x and y values
figure(figsize=(8, 8))
plot(x, y, "o")
axis((-1, 35, -1, 35))

# Above we simulated the data using the relation y = ax + b.
# Here we show that we can rewrite the same relation using matrix notation
X = row_stack((x, ones_like(x)))

# Notice that the way we have set up our matrix X, each regressor is in a
# row and each observation is in a column. The matrix X should be
# transposed so that regressors are in columns and observations are in
# rows. This orientation is essential for the matrix multiplication to
# work out properly
X = X.T  # Here we transpose X

# We could have also done
X = column_stack((x, ones_like(x)))

# The matrix X can be multiplied by a vector of weights (w). These weights
# effectively shift and scale the data in x; they are the coefficients of
# the linear equation, y = ax + b, that we used to simulate the data.
w = row_stack((a, b))

# We can describe the simulated data by plotting the straight line defined
# by the model y = ax + b. This is the model we used to transform x into y
y_model1 = a * x + b

# Plot the model on top of the data
plot(x, y_model1, linewidth=3)

# Alternatively we can describe the data using the same linear model
# written in matrix notation.
# Note that unlike MATLAB, using the "*" operator on numpy arrays performs
# element-wise multiplication. To perform real matrix multiplication, we
# could use numpy.matrix objects, which can be created with syntax similar
# to MATLAB
mtx1 = matrix("1 2; 3 4")
mtx2 = matrix(X)

# However, it is generally better practice to perform matrix
# multiplication using the numpy.dot() function.

# For those of you who are not familiar with linear algebra: our vector of
# weights, w, must be (1) a column vector and (2) have as many entries as
# there are columns of X. This can be formalized in the following line of
# code, where we assert that the number of columns in X is equal to the
# number of rows in w.
assert X.shape[1] == w.shape[0], "Dimension mismatch between X and w"

# Because the dimensions match we can perform the following multiplication
y_model2 = dot(X, w)

# Plot the model.
# The two models, 1 and 2, are perfectly on top of each other. We play
# with the line style to make them both visible: you should see a
# continuous green line (model 1) and on top of it a red dashed line
# (model 2).
plot(x, y_model2, "--", linewidth=3);
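# As a quick sanity check, we can confirm that the two formulations agree:
# multiplying X by w reproduces a * x + b for every observation. A small
# example also makes the difference between "*" and dot() concrete.
assert allclose(dot(X, w).ravel(), a * x + b)

A = array([[1, 2], [3, 4]])
B = array([[5, 6], [7, 8]])
print A * B      # element-wise: [[ 5 12] [21 32]]
print dot(A, B)  # matrix product: [[19 22] [43 50]]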
# Next we simulate a dataset that includes a non-linear component.
# First we create a variable, x
x = arange(-15, 16)

# We will now need three parameters
a = 1.5  # Scaling: expands y in relation to the same value of x
b = 2    # Additive term: shifts y up in relation to the same value of x
c = 1.5  # This term describes the influence of the non-linear component;
         # the greater this term, the stronger the nonlinearity

noise = randn(len(x)) * 6
y = c * x ** 2 + a * x + b + noise

plot(x, y, ".")
xlabel("variable 1")
ylabel("variable 2")
axis((-40, 40, -10, 400));

X = column_stack((x ** 2, x, ones_like(x)))

# Plot the matrix
matshow(X)
# We will turn off the grid we have been using
grid()
# Set the colormap for the whole image to a scale of grays
gray()
# Turn on a colorbar to compare the values in the image
colorbar()

# Make the weight vector
w = row_stack((c, a, b))

# Check that the dimensions of w and X are correct
sw = w.shape
print "w is %d by %d" % sw
sX = X.shape
print "X is %d by %d" % sX

# Formally check to make sure the dimensions match
assert sw[0] == sX[1], "Dimension mismatch between X and w"

# Build the model with matrix multiplication
y_model3 = dot(X, w)

# Plot the data and the model
plot(x, y, ".")
axis((-40, 40, -10, 400))
plot(x, y_model3);

# Suppose we collected some measure of height, weight, and the
# corresponding "beauty" for a set of individuals.
height = array([1.75, 1.55, 1.82, 1.77, 1.8, 1.65, 1.6])  # in meters
weight = array([75, 70, 72, 65, 68, 70, 68])  # in kg
beauty = array([1, 4, 2, 7, 7, 4, 3])  # 1-7, 7 is most beautiful

# Unlike the MATLAB code, we can do this in one step
scatter(height, weight, beauty * 20, color=colors[2])
axis((1.5, 2, 60, 80))
xlabel("Height (meters)")
ylabel("Weight (kg)");

# Now consider a new individual whose beauty we want to predict
height_ind = 1.9
weight_ind = 67
distance = (height_ind - height) ** 2 + (weight_ind - weight) ** 2

# We can also do this with a function from the scipy package
from scipy.spatial.distance import cdist

pop = column_stack((height, weight))
sample = column_stack((height_ind, weight_ind))
distance2 = cdist(pop, sample)

# By default, cdist computes the Euclidean distance, but many other
# metrics are available. It also generalizes to n dimensions.
# Note that distance holds *squared* distances, so we take the square root
# before comparing; we use allclose() rather than == because the two
# computations can differ by floating-point rounding
assert allclose(sqrt(distance), distance2.ravel())

scatter(height, weight, beauty * 20, color=colors[2])
axis((1.5, 2, 60, 80))
xlabel("Height (meters)")
ylabel("Weight (kg)")

# Find the closest individual
idx = argmin(distance)
# Look up their beauty
predicted_beauty = beauty[idx]

# The nearest-neighbor model uses the closest data point to predict the
# expected value of a new data point. In this sense the model is
# non-parametric: it uses the actual data (the point closest to the new
# point) as the model (prediction).
# Here we plot the new individual with the original group.
scatter(height_ind, weight_ind, predicted_beauty * 20, color=colors[0]);
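# The nearest-neighbor lookup above is easy to package into a small
# reusable function. This is just a sketch (the name predict_nearest is
# ours, an illustrative helper); because cdist accepts points with any
# number of columns, it works for n predictors, not just two.
def predict_nearest(X_pop, y_pop, x_new):
    """Predict y for x_new as the y of the closest row of X_pop."""
    d = cdist(X_pop, atleast_2d(x_new))
    return y_pop[argmin(d)]

# For example, predicting the beauty of our new individual:
print predict_nearest(pop, beauty, [height_ind, weight_ind])

# cdist also supports other metrics, e.g. the city-block (L1) distance:
print cdist(pop, sample, metric="cityblock")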