Source code for menpo.math.decomposition

from __future__ import division
import numpy as np
from scipy.linalg.blas import dgemm


def eigenvalue_decomposition(S, eps=10**-10):
    r"""
    Eigenvalue decomposition of a symmetric matrix, keeping only the
    eigenpairs whose eigenvalues are positive and numerically significant.

    Parameters
    ----------
    S : (N, N) ndarray
        Covariance/Scatter matrix
    eps : float, optional
        Tolerance factor: positive eigenvalues smaller than ``eps`` times
        the largest absolute eigenvalue are discarded.

        Default: `10**-10`

    Returns
    -------
    pos_eigenvectors : (N, p) ndarray
        The eigenvectors corresponding to the `p` retained eigenvalues.
    pos_eigenvalues : (p,) ndarray
        The retained positive eigenvalues, sorted from largest to smallest.
    """
    # compute eigenvalue decomposition
    eigenvalues, eigenvectors = np.linalg.eigh(S)

    # sort eigenvalues from largest to smallest
    index = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[index]
    eigenvectors = eigenvectors[:, index]

    # set tolerance limit
    limit = np.max(np.abs(eigenvalues)) * eps

    # select positive eigenvalues
    pos_index = eigenvalues > 0.0
    pos_eigenvalues = eigenvalues[pos_index]
    pos_eigenvectors = eigenvectors[:, pos_index]

    # check they are within the expected tolerance
    index = pos_eigenvalues > limit
    pos_eigenvalues = pos_eigenvalues[index]
    pos_eigenvectors = pos_eigenvectors[:, index]

    return pos_eigenvectors, pos_eigenvalues
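
A minimal usage sketch of `eigenvalue_decomposition` (illustrative only; the matrices below are hand-picked examples, not part of the module):

import numpy as np

# a symmetric positive-definite matrix with known eigenvalues 3 and 1
S = np.array([[2.0, 1.0],
              [1.0, 2.0]])
vecs, vals = eigenvalue_decomposition(S)
print(vals)                                    # ~[3. 1.], sorted largest first
print(np.allclose(S.dot(vecs), vecs * vals))   # S v = lambda v, columnwise -> True

# a rank-1 scatter matrix: its (near-)zero eigenvalue falls below the
# eps-based tolerance and is discarded
a = np.array([[1.0], [2.0]])
vecs1, vals1 = eigenvalue_decomposition(a.dot(a.T))
print(vals1)                                   # only the positive eigenvalue, ~[5.]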
def principal_component_decomposition(X, whiten=False, center=True,
                                      bias=False, inplace=False):
    r"""
    Apply PCA to the data matrix `X`.

    In the case where the data matrix is very large, it is advisable to set
    `inplace=True`. However, note that this destructively edits the data
    matrix by subtracting the mean inplace.

    Parameters
    ----------
    X : (n_samples, n_features) ndarray
        Training data.
    whiten : bool, optional
        Whether to scale the eigenvectors by the inverse square root of
        their eigenvalues, so that projections of the data onto them have
        unit variance.

        Default: `False`
    center : bool, optional
        Whether to center the data matrix. If `False`, the zero vector is
        subtracted, i.e. the data is assumed to be centred already.

        Default: `True`
    bias : bool, optional
        Whether to use a biased estimate of the covariance. If `False`,
        the normaliser is `n_samples - 1` rather than `n_samples`.

        Default: `False`
    inplace : bool, optional
        Whether to do the mean subtraction inplace or not. This is crucial
        if the data matrix is greater than half the available memory size.

        Default: `False`

    Returns
    -------
    eigenvectors : (n_components, n_features) ndarray
        The eigenvectors of the data matrix, stored as rows.
    eigenvalues : (n_components,) ndarray
        The corresponding positive eigenvalues.
    mean_vector : (n_features,) ndarray
        The mean that was subtracted from the dataset.
    """
    n_samples, n_features = X.shape

    if bias:
        N = n_samples
    else:
        N = n_samples - 1.0

    if center:
        # center data
        mean_vector = np.mean(X, axis=0)
    else:
        mean_vector = np.zeros(n_features)

    # This is required if the data matrix is very large!
    if inplace:
        X -= mean_vector
    else:
        X = X - mean_vector

    if n_features < n_samples:
        # compute covariance matrix
        # S: n_features x n_features
        S = dgemm(alpha=1.0, a=X.T, b=X.T, trans_b=True) / N
        # S should be perfectly symmetrical, but numerical error can creep
        # in. Enforce symmetry here to avoid creating complex eigenvectors
        # from the eigendecomposition.
        S = (S + S.T) / 2.0
        # perform eigenvalue decomposition
        # eigenvectors: n_features x n_components
        # eigenvalues:  n_components
        eigenvectors, eigenvalues = eigenvalue_decomposition(S)
        if whiten:
            # whiten eigenvectors
            eigenvectors *= np.sqrt(1.0 / eigenvalues)
    else:
        # n_features >= n_samples: compute the smaller matrix X X.T / N
        # instead, whose non-zero eigenvalues coincide with those of the
        # covariance matrix
        # S: n_samples x n_samples
        S = dgemm(alpha=1.0, a=X.T, b=X.T, trans_a=True) / N
        # S should be perfectly symmetrical, but numerical error can creep
        # in. Enforce symmetry here to avoid creating complex eigenvectors
        # from the eigendecomposition.
        S = (S + S.T) / 2.0
        # perform eigenvalue decomposition
        # eigenvectors: n_samples x n_components
        # eigenvalues:  n_components
        eigenvectors, eigenvalues = eigenvalue_decomposition(S)
        # map the eigenvectors back to feature space via X.T, rescaling
        # them appropriately (X.T v has norm sqrt(N * eigenvalue))
        if whiten:
            # scale to norm 1 / sqrt(eigenvalue), consistent with the
            # whitening applied in the branch above
            w = 1.0 / (np.sqrt(N) * eigenvalues)
        else:
            # scale to unit norm
            w = np.sqrt(1.0 / (N * eigenvalues))
        # eigenvectors: n_features x n_components
        eigenvectors = w * dgemm(alpha=1.0, a=X.T, b=eigenvectors.T,
                                 trans_b=True)

    # transpose eigenvectors so they are stored as rows
    # eigenvectors: n_components x n_features
    eigenvectors = eigenvectors.T

    return eigenvectors, eigenvalues, mean_vector
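
A usage sketch for `principal_component_decomposition` on synthetic data (illustrative only; the shapes and seed are arbitrary choices, not part of the module). It exercises the `n_features < n_samples` branch and checks that non-whitened eigenvectors are returned with unit norm, while whitened ones scale the projected data to unit sample variance:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 5)     # 100 samples, 5 features -> covariance-matrix branch

evecs, evals, mean = principal_component_decomposition(X)
print(evecs.shape, evals.shape, mean.shape)             # (5, 5) (5,) (5,)
print(np.allclose(np.linalg.norm(evecs, axis=1), 1.0))  # unit-norm rows -> True

# with whiten=True, projecting the centred data onto the returned
# (rescaled) eigenvectors yields components of unit sample variance
# (ddof=1 matches the default bias=False normaliser)
wvecs, wvals, wmean = principal_component_decomposition(X, whiten=True)
Y = (X - wmean).dot(wvecs.T)
print(np.allclose(Y.var(axis=0, ddof=1), 1.0))          # -> True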