teg_PCA

Does principal components analysis (PCA) without calculating the covariance matrix, using the NIPALS algorithm. This makes it usable where the normal method would cause an Out of Memory error.

Usage: [L, L2O, O2L, EV, PEx] = teg_PCA(X).

function [S, L2O, O2L, EV, PEx] = teg_PCA(varargin)

% function [L, L2O, O2L, EV, PEx] = teg_PCA(X, {nCompsMax or expl. var or 'scree'})
%
% Implementation of the NIPALS algorithm.
%
% Input:
%   X: data matrix. Columns are variables, rows are observations.
%   Optional: Stopping criteria:
%       nCompsMax: 1, 2, 3, ... Maximum number of components to extract
%       Expl. var: [0, 1). Proportion of variance to explain.
%       'scree': Stop including components when they explain less than the
%       average amount of variance.
% Output:
%   L: matrix of latent variable scores
%   L2O: matrix of mixing weights: O = L * L2O
%   O2L: matrix of unmixing weights: L = O * O2L
%   EV: matrix of eigenvalues
%   PEx: proportion of explained variance per number of components
%
% Thomas E. Gladwin, 31-03-09

X = varargin{1};
nCompsTag = -1;
scree = 0;
PEx_req = 1;
if length(varargin) > 1,
    par0 = varargin{2};
    if strcmp(varargin{2}, 'scree'),
        scree = 1;
    elseif par0 >= 1,
        nCompsTag = varargin{2};
    else,
        PEx_req = varargin{2};
    end;
end;

[nR, nC] = size(X);
for iC = 1:size(X, 2),
    mX = mean(X(:, iC));
    X(:, iC) = X(:, iC) - mean(X(:, iC));
end;
Full = X;
varX = sum(var(X));
screecrit = varX / min(nR, nC);
S = [];
W = [];
D = [];
AIC = [];
PEx = [];
err0 = [];
eps0 = 0.00000001;

i0 = 1;
error_old = sum(sum(X .^ 2)) / size(X, 1);
err0 = 1;
err00 = error_old;

while 1 == 1,
    random_column = 1 + floor(rand * nC);
    s = X(:, random_column);
    eig_old = s' * s;
    step_error = eps0 * eig_old + 1;

    fprintf(['Component ' num2str(i0) '\t']);

    errstr = '';
    errperiod = 0.1;
    tic;

    it0 = 0;
    max_its = min(size(X, 1), size(X, 2));
    while step_error > eps0 * eig_old,

        if it0 > max_its,
            it0 = it0 + 1;
            break;
        end;
        
        % Projection of X onto s: find loadings
        w = (X' * s) / (s' * s);

        % Normalize loading vector
        w = w / sqrt(w' * w);

        % Project X onto w: find new score vector
        s_new = (X * w) / (w' * w);

        % Check for convergence
        eig_new = s_new' * s_new;
        step_error = abs(eig_old - eig_new);
        
        eig_old = eig_new;
        s = s_new;

        if toc > errperiod,
            tic;
            errstr = overwrite_str(errstr, step_error, eig_old, w);
        end;
    end;
    errstr = overwrite_str(errstr, step_error, eig_old, w);
    S = [S s];
    W = [W; w'];
    D(i0, i0) = eig_old / size(X, 1);
    for iC = 1:size(X, 2),
        v = s * w(iC)';
        X(:, iC) = X(:, iC) - v;
    end;
    dD = diag(D);
    PEx = [PEx; sum(dD) / varX];
    fprintf(['\t' num2str(PEx(end))]);
    fprintf('\n');
    % AIC (p. 48)
    error_new = 0;
    for iC = 1:size(X, 2),
        est = S * W(:, iC);
        error_new = error_new + sum((Full(:, iC) - est) .^ 2) / length(v);
    end;
    err0 = [err0; error_new / err00];
    error_old = error_new;
    % To next iteration, or break
    if scree == 1 && D(i0, i0) <= screecrit || isnan(D(i0, i0)),
        S = S(:, 1:(end - 1));
        W = W(1:(end - 1), :);
        D = D(1:(end - 1), 1:(end - 1));
        PEx = PEx(1:(end - 1));
        fprintf(['Scree crit or NaN eigenvalue hit: removing last component.\n']);
        break;
    end;
    if i0 == nCompsTag + 1 || i0 >= min(size(X, 1), size(X, 2)) || PEx(end) >= PEx_req,
        break;
    end;
    i0 = i0 + 1;
end;

L2O = W;
O2L = L2O' * inv(L2O * L2O');
EV = diag(D);

function errstr = overwrite_str(errstr, step_error, eig_old, w)
for si = 1:length(errstr),
    fprintf('\b');
end;
errstr = num2str([eig_old step_error / eig_old]);
fprintf(errstr);