In [5]:

import matplotlib.pyplot as plt
import numpy as np

np.random.seed(42)

Load the data.

In [6]:

# TODO: use the real data when it's posted.
covar = np.array([[ 2, -1,  0],
                  [-1,  2, -1],
                  [ 0, -1,  2]])
data = np.random.multivariate_normal([-3, 0, 2], covar, size=1000)
data

Out[6]:

array([[-3.07278007,  0.99951483,  1.65069133],
       [-4.26254307,  1.8632204 ,  0.26915018],
       [-5.4060972 ,  1.80926327,  1.12877226],
       ..., 
       [-3.70953051,  0.67185883,  1.43454321],
       [-2.09938491, -1.10424674,  3.05557613],
       [-2.2463543 , -1.70646397,  3.42199853]])

In [8]:

fig, axs = plt.subplots(3, 1, figsize=(10, 20))
for ax, (r1, r2) in zip(axs, ((0, 1), (1, 2), (0, 2))):
    ax.scatter(data[:, r1], data[:, r2])
    ax.set_xlabel('Response {0}'.format(r1))
    ax.set_ylabel('Response {0}'.format(r2))
    ax.set_aspect('equal')

Standardize the data and find the principal components.

In [9]:

data_standard = (data - data.mean(axis=0)) / np.std(data, axis=0)
data_standard

Out[9]:

array([[-0.03362572,  0.66184781, -0.21502194],
       [-0.90530821,  1.29543891, -1.17178116],
       [-1.74313566,  1.25585742, -0.57646675],
       ..., 
       [-0.50014234,  0.42148822, -0.36471108],
       [ 0.67953439, -0.88141489,  0.75790346],
       [ 0.57185694, -1.3231852 ,  1.01166211]])

In [10]:

u, s, v = np.linalg.svd(data_standard, full_matrices=False)
u.shape, s.shape, v.shape

Out[10]:

((1000, 3), (3,), (3, 3))

In [11]:

pca_coords = np.dot(u, np.diag(s))
pca_coords.shape

Out[11]:

(1000, 3)

Scatter plots showing pairs of principal coordinates of the data.

In [12]:

fig, axs = plt.subplots(3, 1, figsize=(10, 20))
for ax, (c1, c2) in zip(axs, ((0, 1), (1, 2), (0, 2))):
    ax.scatter(pca_coords[:, c1], pca_coords[:, c2])
    ax.set_xlabel('Component {0}'.format(c1))
    ax.set_ylabel('Component {0}'.format(c2))
    ax.set_aspect('equal')

Histogram of the data along the largest principal component.

In [13]:

plt.hist(pca_coords[:, 0], bins=20)
plt.title('Distribution of the (standardized) data along the largest principal component')

Out[13]:

<matplotlib.text.Text at 0xa7914ec>