import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)
Load the data.
# TODO: use the real data when it's posted.
covar = np.array([[ 2, -1, 0],
[-1, 2, -1],
[ 0, -1, 2]])
data = np.random.multivariate_normal([-3, 0, 2], covar, size=1000)
data
array([[-3.07278007, 0.99951483, 1.65069133], [-4.26254307, 1.8632204 , 0.26915018], [-5.4060972 , 1.80926327, 1.12877226], ..., [-3.70953051, 0.67185883, 1.43454321], [-2.09938491, -1.10424674, 3.05557613], [-2.2463543 , -1.70646397, 3.42199853]])
fig, axs = plt.subplots(3, 1, figsize=(10, 20))
for ax, (r1, r2) in zip(axs, ((0, 1), (1, 2), (0, 2))):
ax.scatter(data[:, r1], data[:, r2])
ax.set_xlabel('Response {0}'.format(r1))
ax.set_ylabel('Response {0}'.format(r2))
ax.set_aspect('equal')
Standardize the data and find the principal components.
data_standard = (data - data.mean(axis=0)) / np.std(data, axis=0)
data_standard
array([[-0.03362572, 0.66184781, -0.21502194], [-0.90530821, 1.29543891, -1.17178116], [-1.74313566, 1.25585742, -0.57646675], ..., [-0.50014234, 0.42148822, -0.36471108], [ 0.67953439, -0.88141489, 0.75790346], [ 0.57185694, -1.3231852 , 1.01166211]])
u, s, v = np.linalg.svd(data_standard, full_matrices=False)
u.shape, s.shape, v.shape
((1000, 3), (3,), (3, 3))
pca_coords = np.dot(u, np.diag(s))
pca_coords.shape
(1000, 3)
Scatter plots showing pairs of principal coordinates of the data.
fig, axs = plt.subplots(3, 1, figsize=(10, 20))
for ax, (c1, c2) in zip(axs, ((0, 1), (1, 2), (0, 2))):
ax.scatter(pca_coords[:, c1], pca_coords[:, c2])
ax.set_xlabel('Component {0}'.format(c1))
ax.set_ylabel('Component {0}'.format(c2))
ax.set_aspect('equal')
Histogram of the data along the largest principal component.
plt.hist(pca_coords[:, 0], bins=20)
plt.title('Distribution of the (standardized) data along the largest principal component')
<matplotlib.text.Text at 0xa7914ec>