% Principal Component Analysis without Tears
%
% This is a lecture on PCA with pseudo-Latex notes and Matlab code
% Xiaojin Zhu, jerryzhu@cs.wisc.edu
% 10/18/07

% image -> D-dim feature vector x -> point in D-dim

load 38
x=double([three eight]); % 400 digits, half half
x(:,1)
imagesc(reshape(x(:,1), 16, 16)); colormap gray


% w direction 
w=rand(256,1);
w=w/sqrt(w'*w);

% projection w'x (e.g. when w=axis), length=w'x/norm(w)
w'*x(:,1)
hist(w'*x)

% variance of the projected dataset
% 1/n sum (w'x - w'mu)^2
var(w'*x, 1)

% PCA goal: find w to maximize the variance (preserve differences)
% max_w 1/n sum(w'x-w'mu)^2 
%       = w' S w
% s.t. w'w=1
% where S=1/n sum (x-mu)(x-mu)' is the covariance matrix
mu=mean(x,2);
S=(x-repmat(mu,1,400))*(x-repmat(mu,1,400))'/400;
imagesc(S); colorbar

% Lagrangian w'Sw + lambda (1-w'w)
% grad = 1/2 Sw - 1/2 lambda w = 0
% Sw=lambda w, w is an eigenvector of S!
% w'Sw = lambda w'w = lambda => want largest eigenvalue.
[evec, eval]=eigs(S,1);

% verify that it preserves more variance than the random direction w
var(w'*x, 1)
var(evec'*x, 1)

% if instead of a projection direction, we want a M dimensional projection space,
% these are the largest M eigenvectors/values.
M=3;
[evec, eval]=eigs(S,M);
var(evec(:,1)'*x, 1)
var(evec(:,2)'*x, 1)
var(evec(:,3)'*x, 1)
eval
% why var()=eval? var=w'Sw = lambda w'w = lambda

% geometric view: M major axes
% M can at most be D, then M is a *rotated* coordinate system aligned with major axes

%-------------------------------------------------
% Now a different view of PCA: dimensionality reduction.
% Each digit image needs 256 numbers (pixels).
% I'll show you how to use only M numbers! (Dimensionality reduction)

% first center the data: y=x-mu
y = x - repmat(mu,1,400);

% for any orthonomal basis u1...uD, yi can be written as
% 	yi = sum_{j=1}^D alpha_ij uj
% where
%	alpha_ij = uj' yi.
% consider the M-term approximation of yi
% 	zi = sum_{j=1}^M alpha_ij uj
% we want zi to be close (squared difference) to yi, for all images i=1..n.
% the difference is
%	J = 1/n sum_{i=1}^n (yi-zi)'(yi-zi)
%	  = 1/n sum_i || sum_{j=M+1}^D alpha_ij uj ||^2
%	  = 1/n sum_i sum_{j=M+1}^D alpha_ij^2
%	  = 1/n sum_i sum_{j=M+1}^D uj' yi yi' uj
%	  = sum_{j=M+1}^D uj' (1/n sum_i yi yi') uj
%	  = sum_{j=M+1}^D uj' S uj
% where 
%	S=1/n sum yi yi'
%	 =1/n sum (xi-mu)(xi-mu)' 
% is the SAME covariance matrix as above.
% We want to minimize J by choosing uj for j=M+1..D.
% This can be achieved by letting uj be the *smallest* eigenvectors of S.
% Note we throw these away from yi to zi.
% Therefore zi *retains* (or uses) the M *largest* eigenvectors.
%
% Here's the algorithm: 
% 1. Center the dataset
y = x - repmat(mu,1,400);

% 2. Compute the M largest eigenvector/values
M=3;
[evec, eval]=eigs(S,M);

% 3. For yi, we approximate it by zi=sum_{j=1}^M alpha_ij uj
% and we will store it as alpha_i{1..M}
% where
%	alpha_ij = uj' yi.
% This is dimensionality reduction if M<D!
alpha=evec'*y;

% 4. To recover the original xi, do
%    xi = yi + mu ~= zi + mu = sum_{j=1}^M alpha_ij uj + mu
xapprox = evec*alpha + repmat(mu, 1, 400);

% let's compare the images
imagesc(reshape(x(:,1), 16, 16)); colormap gray
imagesc(reshape(xapprox(:,1), 16, 16)); colormap gray
% repeat this for different M 

% the uj's are the new axes.  look what they're like
imagesc(reshape(evec(:,1), 16, 16)); colorbar; colormap gray

% usage of PCA 1: visualization
plot3(alpha(1,1:200), alpha(2,1:200), alpha(3,1:200), 'ro')
hold on
plot3(alpha(1,201:400), alpha(2,201:400), alpha(3,201:400), 'b+')
xlabel('\alpha1')
ylabel('\alpha2')
zlabel('\alpha3')
hold off

% usage of PCA 2: extrapolate to generate novel images
newalpha1=[500; -400; 0];
newx = evec*newalpha1 + mu;
imagesc(reshape(newx(:,1), 16, 16)); colormap gray

newalpha2=[-500; 0; 0];
newx = evec*newalpha2 + mu;
imagesc(reshape(newx(:,1), 16, 16)); colormap gray

for i=0:0.1:1,
  newalpha = (1-i)*newalpha1 + i*newalpha2
  newx = evec*newalpha + mu;
  imagesc(reshape(newx(:,1), 16, 16)); colormap gray
  pause
end


% PCA is an *unsupervised* technique.
% When it can go wrong: classification (supervised learning).
% In that case, the right thing to do is Fisher's Linear Discriminant Analysis.