#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Probabilistic curve fitting: express
# uncertainty over the target t using a Gaussian
# (aka normal) distribution
get_ipython().run_line_magic('pylab', 'inline')

def gaussian(x, mu, sigma):
    # vectorised for multiple x and sigma values,
    # returning x varying in the 1st dimension (down), sigma in the 2nd (right)
    numerator = np.exp(-0.5 * array(mat(x - mu).T * mat(1./sigma))**2)
    return 1./(np.sqrt(2*np.pi)*sigma) * numerator

def cond_pdf_t(t, x, w, sigma):
    """Probability density of t given x, w, sigma"""
    return gaussian(t, polyval(w, x), sigma)

from IPython.display import display, Math, Latex


# In[2]:


# Do polynomial fitting like before, for now just
# trusting the built-in polyfit.
# We sample from a sin curve with added Gaussian noise
# of standard deviation 0.3
N = 15  # training points
M = 4   # polynomial degree
x = arange(0, 2*pi, (2*pi)/N)
noise = 0.3 * randn(N)
t = sin(x) + noise
plot(x, t, 'ro')
w = polyfit(x, t, M)
xs = arange(0, 2*pi, .01)
plot(xs, polyval(w, xs), 'g')
ylabel("t")
xlabel("x")


# In[3]:


# Consider the probability distribution for
# the predicted value of t at a certain x
# position, e.g. one of our training points
xvalue = x[5]

# From above, we are assuming it is distributed
# normally, with mean given by the value of our
# polynomial; for now let's pick 0.3 as the standard
# deviation (cheating, because we know that is the
# real value from the noise)
tmean = polyval(w, xvalue)
sigma = 0.3

# We plot over the domain +-4 standard deviations,
# using the conditional probability function defined earlier
ts = arange(tmean - 4*sigma, tmean + 4*sigma, 0.01)
pdf_t = cond_pdf_t(ts, xvalue, w, sigma)
plot(ts, pdf_t)
xlabel("t")
title(r"$p(t|x,$w$,\sigma)$", fontsize=20)
# As expected, it is just a normal distribution


# In[4]:


plot(x, t, 'ro')
plot(xs, polyval(w, xs), 'g')

# Now, to illustrate, we plot this Gaussian sideways
# over our graph, showing how its width (standard deviation)
# expresses our uncertainty in the value of t
def plot_sideways(x, y, zeropos, *args, **kwargs):
    # Plot sideways by switching x and y, adding zeropos
    # to our y to change the location
    plot(y + zeropos, x, *args, **kwargs)
    # Draw a vertical line for the axis
    vlines(zeropos, min(x), max(x), *args, **kwargs)

plot_sideways(ts, pdf_t, xvalue, 'b')
ylabel("t")
xlabel("x")
text(xvalue + max(pdf_t), polyval(w, xvalue), r" $p(t|x,$w$,\sigma)$")
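
# In[ ]:


# A quick sanity check of the hand-rolled density (a sketch, not part of the
# original analysis): the pdf should integrate to roughly 1 over the +-4 sigma
# grid, and, assuming scipy is installed, should agree with scipy.stats.norm.pdf.
print(pdf_t.sum() * 0.01)  # crude Riemann sum with grid spacing 0.01; close to 1.0

try:
    from scipy.stats import norm  # assumption: scipy is available in this environment
    print(allclose(pdf_t.flatten(), norm.pdf(ts, loc=tmean, scale=sigma)))
except ImportError:
    pass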
# We can also plot probability as color in the 2d plane.

# In[5]:


X, Y = meshgrid(arange(0, 7, 0.1), arange(-1, 2.1, 0.1))
Z = [cond_pdf_t(Yi, Xi, w, sigma) for Xi, Yi in zip(X, Y)]
Z = array(Z).reshape(X.shape)
jet()
contourf(arange(0, 7, 0.1), arange(-1, 2.1, 0.1), Z, levels=40)
colorbar()
xlim([0, 6.9])
ylim([-1, 2])
scatter(x, t, c='w')


# In[6]:


# So, now the question is how do we learn the weights w
# and the standard deviation sigma from our training data.
# Well, we can see how probable each training point is given some
# weights and standard deviation using our conditional probability
# density function (now working on a vector of training data rather than a scalar).
# (Note these values are densities, not probabilities, hence may be > 1.)
print(cond_pdf_t(t, x, w=[0.01, -0.02, -0.5, 1.7, -0.4], sigma=0.3))


# In[7]:


# But we want to know how probable all the data is -- for this
# we use the product rule, assuming the data was drawn independently,
# simply multiplying all the previous values
p = product(cond_pdf_t(t, x, w=[0.01, -0.02, -0.5, 1.7, -0.4], sigma=0.3), 0)
Math(r"$p(\mathbf{t}|\mathbf{x},\mathbf{w},\sigma) = \prod_{n=1}^{N} Norm(t_n|y(x_n,\mathbf{w}),\sigma) = %f$" % p)
# This is the value we want to maximise -- try tweaking the parameters
# to get a high value, it probably isn't easy!


# In[8]:


# When considered as a function of w and sigma, this is our
# likelihood function, and is *not* a probability distribution
# over these variables. Note that syntactically it is the same
# expression as a conditional probability distribution on t; it is just
# a matter of which variables we "fix" and which we "slide around".
def likelihood_w_sigma(w, sigma):
    return product(cond_pdf_t(t, x, w, sigma), 0)

# We can plot the likelihood against different standard deviations
# to see where the maximum is -- how does this compare to the sd
# we actually set for the noise?
sigs = arange(0.02, 1, 0.01)
plot(sigs, likelihood_w_sigma(w, sigs))
xlabel(r"$\sigma$", fontsize=22)
ylabel("Likelihood")


# In[9]:


# In practice we consider the log of the likelihood, which lets us
# turn the product into a sum and avoids numerical underflow
# from multiplying many small values. We want to maximise the log likelihood,
# or equivalently, minimise the negative log likelihood.
def nlog_likelihood_w_sigma(w, sigma):
    # log(product(cond_pdf_t(t, x, w, sigma), 0))
    # = sum(log(cond_pdf_t(t, x, w, sigma)), 0)
    # using the negative:
    return -sum(log(cond_pdf_t(t, x, w, sigma)), 0)

plot(sigs, nlog_likelihood_w_sigma(w, sigs))
xlabel(r'$\sigma$')
ylabel('NLL')
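
# In[ ]:


# A rough numerical check (a sketch, assuming the grid of sigmas above is fine
# enough): pick the sigma on the grid with the smallest negative log likelihood
# and compare it to the noise level we actually used (0.3) and to the
# root-mean-square residual of the fitted polynomial, which turns out to be the
# closed-form maximum likelihood estimate of sigma for a fixed w (not derived above).
sigma_grid = sigs[argmin(nlog_likelihood_w_sigma(w, sigs))]
sigma_rms = sqrt(mean((t - polyval(w, x))**2))
print(sigma_grid, sigma_rms)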
# Maximum Likelihood Solution
# ===========================
#
# The Maximum Likelihood solution can be found in closed form.
#
# First we can use the properties of the (natural) log to simplify the formula
#
# $\log(p(\mathbf{t}|\mathbf{x}, \mathbf{w},\sigma)) = \log(\prod_{n=1}^{N} Norm(t_n|y(x_n,\mathbf{w}),\sigma))$
#
# $ = \sum_{n=1}^{N} \log[(2\pi\sigma^2)^{-1/2} \exp(-\frac{(t_n-y(x_n,\mathbf{w}))^2}{2\sigma^2})]$
#
# $ = N \log((2\pi\sigma^2)^{-1/2}) -\frac{1}{2\sigma^2} \sum_{n=1}^{N} (t_n-y(x_n,\mathbf{w}))^2 $
#
# $ = -\frac{1}{2\sigma^2} \sum_{n=1}^{N} (t_n-y(x_n,\mathbf{w}))^2 - \frac{N}{2}\log(\sigma^2) - \frac{N}{2}\log(2\pi) $
#
# Now we can find the gradient wrt $\mathbf{w}$ and $\sigma^2$
#
# (TODO)

# In polynomial regression, the hypothesis, $y$, is of the form:
#
# $y(x_n, \mathbf{w}) = \sum_{i=0}^{M} w_i x_n^{i}$, where $M$ is the degree of the polynomial (the index starts at 0, so there are $M+1$ weights).
#
# Using matrix notation, let $\mathbf{w} = (w_0, \ldots, w_M)$, a vector of weights. Also, let $\mathbf{x_n} = (x_n^{0}, \ldots, x_n^{M})$, a vector of the first $M+1$ powers of $x_n$.
#
# Then, $y(x_n, \mathbf{w}) = \mathbf{w}^{T}\mathbf{x_n}$.
#
# The derivative of $y(x_n, \mathbf{w})$ wrt $\mathbf{w}$ will come in handy soon, and we can see that $\frac{d y}{d \mathbf{w}} = \mathbf{x_n}$ by the linearity of $\mathbf{w}^{T}\mathbf{x_n}$.
#
# Another point of notation: we'll introduce the design matrix, $X$, in which each training example is a row and each power of $x_n$ is a column.
#
# Taking the derivative of $-\frac{1}{2\sigma^2} \sum_{n=1}^{N} (t_n-y(x_n,\mathbf{w}))^2 - \frac{N}{2}\log(\sigma^2) - \frac{N}{2}\log(2\pi)$ with respect to $\mathbf{w}$, we obtain
#
# $ -\sum_{n=1}^{N}(t_n - y(x_n, \mathbf{w})) \frac{d y}{d \mathbf{w}}$,
#
# using the fact that terms not involving $\mathbf{w}$ vanish and that we don't care about nonzero scalar multiples of the gradient, since we will set it to zero.
#
# Replacing $\frac{d y}{d \mathbf{w}}$ with what we computed earlier,
#
# $ = -\sum_{n=1}^{N}(t_n - y(x_n, \mathbf{w}))\mathbf{x_n}$
#
# $ = -\sum_{n=1}^{N}(t_n - \mathbf{w}^{T}\mathbf{x_n})\mathbf{x_n}$
#
# Now using the design matrix to simplify things, and letting $\mathbf{t} = (t_1, \ldots, t_N)$, the vector of targets, we can write this $(M+1)$-dimensional gradient of the error with respect to the weights as
#
# $ = -\sum_{n=1}^{N}(t_n \mathbf{x_n}) + \sum_{n=1}^{N}(\mathbf{w}^{T}\mathbf{x_n})\mathbf{x_n}$
#
# $ = -X^{T}\mathbf{t} + X^{T}(X\mathbf{w}) $
#
# And since we are at an optimum when this gradient is zero,
#
# $ 0 = -X^{T}\mathbf{t} + X^{T}(X\mathbf{w}) $
#
# $ X^{T}\mathbf{t} = (X^{T}X)\mathbf{w} $
#
# Almost there! Assuming $X^{T}X$ is invertible (so that $(X^{T}X)^{-1}X^{T}$ is the pseudoinverse of $X$),
#
# $ \mathbf{w} = (X^{T}X)^{-1}X^{T}\mathbf{t} $
#
# and we have a closed-form solution for the weights. This solution is referred to as the "normal equation". Note that for big-data problems the performance of this equation is pretty poor, and a much faster iterative approach (e.g. gradient descent) is typically used instead.
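
# In[ ]:


# A minimal sketch of the normal equation derived above, checked against polyfit.
# Assumptions: the design matrix uses increasing powers x^0 ... x^M (so the
# resulting weights come out in the reverse order of polyfit's output), and
# X^T X is well enough conditioned to solve directly.
X_design = np.vander(x, M + 1, increasing=True)  # row n is (x_n^0, ..., x_n^M)
w_normal = np.linalg.solve(X_design.T.dot(X_design), X_design.T.dot(t))
print(np.allclose(w_normal[::-1], w))  # should agree with polyfit's w up to numerical precision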