%pylab inline
from __future__ import division
import utils
import seaborn
seaborn.set()
colors = seaborn.color_palette()

# For example, if we have some data X
X1 = randn(100, 2)
plot(X1[:, 0], X1[:, 1], "o");

# And we multiply that data by a matrix A
A = array([[1, .5], [.5, 1]])
X2 = dot(X1, A)
plot(X2[:, 0], X2[:, 1], "o");

# Multiplying by the inverse of matrix A will undo the
# linear transformation of X that was obtained by multiplying by A
Ainv = inv(A)
X1b = dot(X2, Ainv)
plot(X1b[:, 0], X1b[:, 1], "o");

# We can check this assertion formally in Python, with the caveat that there
# are some very small numerical rounding errors.
assert all(abs(X1 - X1b) < 1e-5), "Matrix 1 does not equal matrix 2"

# NumPy also provides a built-in assertion that accounts for floating point precision
from numpy.testing import assert_array_almost_equal
assert_array_almost_equal(X1, X1b)

# Another way to say this is that the matrix obtained by multiplying A with
# its inverse is a matrix that will not change the values of X. This matrix
# is known as the identity matrix and contains 1s on the diagonal and 0s in
# every other entry.
I = dot(A, inv(A))
# (again allowing for tiny floating point rounding error)
assert_array_almost_equal(I, eye(I.shape[0]))

# Fabricate a dataset with 4 variables
X = randn(100, 4)

# Fabricate a variable y that depends on these four variables and some noise
noise = randn(100) * 10
y = 1 * X[:, 0] + 7 * X[:, 1] + 3 * X[:, 2] + 5 * X[:, 3] + 7 + noise

# To capture the constant term, we will add a column of 1s to our regressor matrix
X = column_stack((X, ones(100)))

# Fit the model based on the ordinary least squares (OLS) formula
w_ols = dot(dot(inv(dot(X.T, X)), X.T), y)
# Note how the lack of built-in matrix multiplication for arrays is a bit annoying

# We can also use matrix objects
X_mtx = asmatrix(X)
y_mtx = asmatrix(y).T
w_ols2 = inv(X_mtx.T * X_mtx) * X_mtx.T * y_mtx

# We see these are the same (once we account for the 2D shape of the matrix result)
assert_array_almost_equal(w_ols.reshape(w_ols2.shape), w_ols2)

# Use numpy's lstsq function to fit the model
# lstsq returns a 4-tuple
w_lstsq, residue, rnk, s = lstsq(X, y)
# You can also do w_lstsq = lstsq(X, y)[0]
# If you are following the MATLAB tutorial, note that the argument order
# for lstsq is backward relative to regress

# These are also identical
assert_array_almost_equal(w_ols, w_lstsq)
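# The explicit inverse above works fine for a small, well-conditioned problem,
# but solving the normal equations (X.T X) w = X.T y directly is the more
# numerically stable way to get the same coefficients. A minimal sketch,
# assuming the same X and y as above (w_solve is just an illustrative name):
from numpy.linalg import solve
w_solve = solve(dot(X.T, X), dot(X.T, y))
assert_array_almost_equal(w_ols, w_solve)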
# First we compute our model's prediction
modelprediction = dot(X, w_ols)

# Visualize this prediction by plotting it against our data
# The imperfections arise from the noise we added to the data
plot(modelprediction, y, "o", color=colors[1])
xlabel("model prediction")
ylabel("original data");

# Then we subtract the model prediction from the measured data,
# square that difference, and take the mean
mse_ols = mean((y - modelprediction) ** 2)
print "Our mean squared error is %.4f using ordinary least squares regression" % mse_ols

# Various solvers live in the optimize package
from scipy import optimize

# Define an anonymous function that returns the mean squared error between our data
# and the result of multiplying the predictor matrix by a weight vector
costfunc = lambda w: mean(square(y - dot(X, w)))

# Start the search with all weights at 0
seed = zeros(5)

# Fit the model by searching over the space of cost function values and finding the lowest
# Set the limit on function evaluations to infinity
optsol = optimize.fmin(costfunc, seed, maxfun=inf)

# Compare the weights found by the optimizer with the closed-form OLS weights
axes(aspect="equal")
plot(w_ols, optsol, "D", color=colors[2], markersize=7)
xlabel("OLS solution")
ylabel("Optimize solution");

mse_optim = mean(square(y - dot(X, optsol)))
print "OLS MSE: %.4f" % mse_ols
print "Optimized MSE: %.4f" % mse_optim
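# A quick sanity check on the numbers above, assuming the same simulated data:
# the noise we added has standard deviation 10, so both MSE values should land
# near the noise variance (roughly 100 for this dataset).
print "Sample noise variance: %.4f" % var(noise)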