In [1]:

%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.datasets import make_low_rank_matrix

In [2]:

# Synthetic 50 x 100 data matrix with an approximately low-rank structure.
# The matrix is randomly generated, so the seed is fixed to make
# "Restart Kernel -> Run All" regenerate exactly the same data.
X = make_low_rank_matrix(50, 100, effective_rank=10, random_state=0)

# Sum of the per-feature variances of X: the denominator of every
# "empirical explained variance ratio" computed in the cells below.
total_variance = np.var(X, axis=0).sum()

Let's check that the explained variance ratio estimated internally by a full PCA model matches the true explained variance ratio as measured from the data:

In [3]:

# Fit a full (untruncated) PCA and plot the ratio the model reports
# for each component.
pca = PCA()
pca.fit(X)
plt.plot(pca.explained_variance_ratio_, label='pca explained variance')

# Measure the same quantity empirically: variance of the projected data
# along each component, relative to the total variance of X.
X_transformed = pca.transform(X)
variances = X_transformed.var(axis=0)
true_explained_variance_ratio = variances / total_variance
plt.plot(true_explained_variance_ratio, label='empirical explained variance')

# Decorate the figure; the trailing ';' keeps the cell from echoing the
# return value of the last call.
plt.legend(loc='best')
plt.xlabel('pca components')
plt.ylabel('fraction of total variance');

If the PCA is only partially computed (early truncation), the explained variance ratio can no longer be estimated correctly by the model:

In [4]:

# NOTE(review): RandomizedPCA was deprecated in scikit-learn 0.18 and removed
# in 0.20, so this cell needs an older release. It is kept on purpose: the
# notebook documents how this estimator reports its explained variance ratio.
#
# Keep only the first 20 components. The solver is randomized, so its seed is
# fixed to make the result stable across re-runs for a given X.
pca = RandomizedPCA(n_components=20, random_state=0).fit(X)

# Ratio reported by the truncated model for each retained component.
plt.plot(pca.explained_variance_ratio_, label='truncated pca explained variance')

# Empirical ratio: variance of the projected data along each retained
# component, relative to the total variance of the original data.
X_transformed = pca.transform(X)
variances = np.var(X_transformed, axis=0)
true_explained_variance_ratio = variances / total_variance
plt.plot(true_explained_variance_ratio, label='empirical explained variance')

# Decorate the figure; the trailing ';' keeps the cell from echoing the
# return value of the last call.
plt.legend(loc='best')
plt.xlabel('pca components')
plt.ylabel('fraction of total variance');

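Because of the truncation, the RandomizedPCA model is ignoring ~27% of the total variance of the data, so its `explained_variance_ratio_` estimate is lying: the reported ratios sum to 1.0, whereas the retained components actually explain only ~73% of the variance.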

In [5]:

# Total of the per-component ratios reported by the truncated model.
np.sum(pca.explained_variance_ratio_)

Out[5]:

1.0

In [6]:

# Fraction of the total data variance measured along the retained components.
np.sum(true_explained_variance_ratio)

Out[6]:

0.73030800688027508

In [ ]: