The notebook is better viewed rendered as slides. You can convert it to slides and view them by:
$ ipython nbconvert --to slides --post serve <this-notebook-name.ipynb>
This and other related IPython notebooks can be found at the course github repository:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
from scipy.stats import norm, multivariate_normal
import math
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rc('text', usetex=True)
plt.rc('font', family='serif')
plt.rcParams['text.latex.preamble'] ='\\usepackage{libertine}\n\\usepackage[utf8]{inputenc}'
import seaborn
seaborn.set(style='whitegrid')
seaborn.set_context('notebook')
Random variable: a variable whose value is subject to variations due to chance. A random variable can take on a set of possible different values, each with an associated probability, in contrast to other mathematical variables.
Probability distribution: mathematical function describing the possible values of a random variable and their associated probabilities.
Probability density function (pdf) of a continuous random variable is a function that describes the relative likelihood for this random variable to take on a given value.
The probability distribution of a random variable is often characterised by a small number of parameters, which also have a practical interpretation.
sample1 = np.random.normal(0, 0.5, 1000)
sample2 = np.random.normal(1,1,500)
def plot_normal_sample(sample, mu, sigma):
'Plots an histogram and the normal distribution corresponding to the parameters.'
x = np.linspace(mu - 4*sigma, mu + 4*sigma, 100)
plt.plot(x, norm.pdf(x, mu, sigma), 'b', lw=2)
plt.hist(sample, 30, normed=True, alpha=0.2)
plt.annotate('3$\sigma$',
xy=(mu + 3*sigma, 0), xycoords='data',
xytext=(0, 100), textcoords='offset points',
fontsize=15,
arrowprops=dict(arrowstyle="->",
connectionstyle="arc,angleA=180,armA=20,angleB=90,armB=15,rad=7"))
plt.annotate('-3$\sigma$',
xy=(mu -3*sigma, 0), xycoords='data',
xytext=(0, 100), textcoords='offset points',
fontsize=15,
arrowprops=dict(arrowstyle="->",
connectionstyle="arc,angleA=180,armA=20,angleB=90,armB=15,rad=7"))
plt.figure(figsize=(11,4))
plt.subplot(121)
plot_normal_sample(sample1, 0, 0.5)
plt.title('Sample 1: $\mu=0$, $\sigma=0.5$')
plt.subplot(122)
plot_normal_sample(sample2, 1, 1)
plt.title('Sample 2: $\mu=1$, $\sigma=1$')
plt.tight_layout();
print('Sample 1; estimated mean:', sample1.mean(), ' and std. dev.: ', sample1.std())
print('Sample 2; estimated mean:', sample2.mean(), ' and std. dev.: ', sample2.std())
Covariance is a measure of how much two random variables change together. $$ \operatorname{cov}(X,Y) = \operatorname{E}{\big[(X - \operatorname{E}[X])(Y - \operatorname{E}[Y])\big]}, $$ $$ \operatorname{cov}(X,X) = s(X), $$
sample_2d = np.array(list(zip(sample1, np.ones(len(sample1))))).T
plt.scatter(sample_2d[0,:], sample_2d[1,:], marker='x');
np.cov(sample_2d) # computes covariance between the two components of the sample
As the sample is only distributed along one axis, the covariance does not detects any relationship between them.
What happens when we rotate the sample?
def rotate_sample(sample, angle=-45):
'Rotates a sample by `angle` degrees.'
theta = (angle/180.) * np.pi
rot_matrix = np.array([[np.cos(theta), -np.sin(theta)],
[np.sin(theta), np.cos(theta)]])
return sample.T.dot(rot_matrix).T
rot_sample_2d = rotate_sample(sample_2d)
plt.scatter(rot_sample_2d[0,:], rot_sample_2d[1,:], marker='x');
np.cov(rot_sample_2d)
mu = [0,1]
cov = [[1,0],[0,0.2]] # diagonal covariance, points lie on x or y-axis
sample = np.random.multivariate_normal(mu,cov,1000).T
plt.scatter(sample[0], sample[1], marker='x', alpha=0.29)
estimated_mean = sample.mean(axis=1)
estimated_cov = np.cov(sample)
e_x,e_y = np.random.multivariate_normal(estimated_mean,estimated_cov,500).T
plt.plot(e_x,e_y,'rx', alpha=0.47)
x, y = np.mgrid[-4:4:.01, -1:3:.01]
pos = np.empty(x.shape + (2,))
pos[:, :, 0] = x; pos[:, :, 1] = y
rv = multivariate_normal(estimated_mean, estimated_cov)
plt.contour(x, y, rv.pdf(pos), cmap=cm.viridis_r, lw=4)
plt.axis('equal');
fig = plt.figure(figsize=(11,5))
ax = fig.gca(projection='3d')
ax.plot_surface(x, y, rv.pdf(pos), cmap=cm.viridis_r, rstride=30, cstride=10, linewidth=1, alpha=0.47)
ax.plot_wireframe(x, y, rv.pdf(pos), linewidth=0.47, alpha=0.47)
ax.scatter(e_x, e_y, 0.4, marker='.', alpha=0.47)
ax.axis('tight');
Again, what happens if we rotate the sample?
rot_sample = rotate_sample(sample)
estimated_mean = rot_sample.mean(axis=1)
estimated_cov = np.cov(rot_sample)
e_x,e_y = np.random.multivariate_normal(estimated_mean,estimated_cov,500).T
fig = plt.figure(figsize=(11,4))
plt.subplot(121)
plt.scatter(rot_sample[0,:], rot_sample[1,:], marker='x', alpha=0.7)
plt.title('"Original" data')
plt.axis('equal')
plt.subplot(122)
plt.scatter(e_x, e_y, marker='o', color='g', alpha=0.7)
plt.title('Sampled data')
plt.axis('equal');
Covariance captures the dependency and can model disposition of the "original" sample.
x, y = np.mgrid[-4:4:.01, -3:3:.01]
pos = np.empty(x.shape + (2,))
pos[:, :, 0] = x; pos[:, :, 1] = y
rv = multivariate_normal(estimated_mean, estimated_cov)
fig = plt.figure(figsize=(11,5))
ax = fig.gca(projection='3d')
ax.plot_surface(x, y, rv.pdf(pos), cmap=cm.viridis_r, rstride=30, cstride=10, linewidth=1, alpha=0.47)
ax.plot_wireframe(x, y, rv.pdf(pos), linewidth=0.47, alpha=0.47)
ax.scatter(e_x, e_y, 0.4, marker='.', alpha=0.47)
ax.axis('tight');