Skip to content

Commit 02e94ec

Browse files
RobertRobert
Robert
authored and
Robert
committed
Use PCA to reduce the dimensionality of handwritten digit data from 256 to 2, and achieve a better classification rate with a 1-nearest-neighbor classifier
1 parent 00aebf3 commit 02e94ec

File tree

6 files changed

+156
-0
lines changed

6 files changed

+156
-0
lines changed

bonus.m

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
% BONUS - plot the average test error of the PCA + 1-NN classifier against
% the number of principal components that are kept.

% Only workspace variables need resetting; 'clear all' would also flush
% cached functions, globals and breakpoints.
clear;

% Numbers of principal components to evaluate.  The digit data is not
% loaded here: pca_classifier loads digits.mat itself on every call.
m = [2,5,10,20];
numIter = numel(m);
avgErrTest = zeros(numIter,1);

for iter = 1:numIter
    avgErrTest(iter) = pca_classifier(m(iter));
end

figure;
hold on;

plot(m,avgErrTest,'r-');

title('avg classification error rate VS PC');
xlabel('number of Principal Components');
ylabel('avg classification error rate');
23+
24+

digits.mat

406 KB
Binary file not shown.

knn.m

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
function [label_test] = knn(k, data_train, label_train, data_test)
% KNN - k-nearest-neighbour classifier using indicator (one-hot) labels.
%
%   label_test = knn(k, data_train, label_train, data_test)
%
%   k           - number of nearest neighbours that vote (integer >= 1 and
%                 no larger than the number of training samples)
%   data_train  - training samples, one sample per column
%   label_train - (D x M) label matrix; column j is the label vector of
%                 training sample j
%   data_test   - test samples, one sample per column, same number of
%                 rows as data_train
%
% Returns:
%   label_test  - (D x N) logical matrix, N = number of test samples.
%                 Entry (c, j) is true when row c collects the largest
%                 vote among the k nearest neighbours of test sample j.
%                 A tie marks every winning row as true.

narginchk(4, 4);

num_train = size(data_train, 2);
if ~isscalar(k) || ~isnumeric(k) || k < 1 || k ~= fix(k) || k > num_train
    error('knn:invalidK', ...
          'k must be an integer between 1 and the number of training samples (%d).', ...
          num_train);
end

D = size(label_train, 1);
num_test = size(data_test, 2);

% dist(i, j) is the distance from training sample i to test sample j.
dist = l2_distance(data_train, data_test);

% Sort explicitly along dimension 1 (over training samples).  Without the
% dimension argument a single training sample makes dist a row vector and
% sort would order the test samples instead.
[~, nearest] = sort(dist, 1);
nearest = nearest(1:k, :);

% Accumulate the label vectors of the k nearest neighbours.  The arg-max
% of the summed votes equals the arg-max of their mean.
votes = zeros(D, num_test);
for i = 1:k
    votes = votes + label_train(:, nearest(i, :));
end

label_test = votes == repmat(max(votes, [], 1), D, 1);

l2_distance.m

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
function d = L2_distance(a,b,df)
% L2_DISTANCE - pairwise Euclidean distances between two sets of vectors.
%
%   d = L2_distance(A,B)
%   d = L2_distance(A,B,df)
%
%   A  - (DxM) matrix, one vector per column
%   B  - (DxN) matrix, one vector per column
%   df - 1 forces the diagonal of the result to zero;
%        0 (default) leaves it untouched
%
% Returns:
%   d  - (MxN) matrix, d(i,j) is the distance between A(:,i) and B(:,j)
%
% Method:
%   Fully vectorised, based on the expansion
%
%       ||A-B|| = sqrt( ||A||^2 + ||B||^2 - 2*A.B )
%
% Example:
%   A = rand(400,100); B = rand(400,200);
%   d = L2_distance(A,B);

% Original author : Roland Bunschoten
%                   University of Amsterdam
%                   Intelligent Autonomous Systems (IAS) group
%                   Kruislaan 403  1098 SJ Amsterdam
%                   tel.(+31)20-5257524
%
% Last Rev : Wed Oct 20 08:58:08 MET DST 1999
% Tested   : PC Matlab v5.2 and Solaris Matlab v5.3
%
% Copyright notice: You are free to modify, extend and distribute
%    this code granted that the author of the original code is
%    mentioned as the original author of the code.
%
% Fixed by JBT (3/18/00) to work for 1-dimensional vectors
% and to warn for imaginary numbers.  Also ensures that
% output is all real, and allows the option of forcing diagonals to
% be zero.

if nargin < 2
    error('Not enough input arguments');
end

% The diagonal is left alone unless the caller asks otherwise.
if nargin < 3
    df = 0;
end

if size(a,1) ~= size(b,1)
    error('A and B should be of same dimensionality');
end

if ~isreal(a) || ~isreal(b)
    disp('Warning: running distance.m with imaginary numbers.  Results may be off.');
end

% Pad 1-dimensional data with a row of zeros so that the column sums
% below run over the vector dimension rather than over the samples.
if size(a,1) == 1
    a = vertcat(a, zeros(1, size(a,2)));
    b = vertcat(b, zeros(1, size(b,2)));
end

% Squared norms of every column, and all pairwise inner products.
sqnorm_a = sum(a.*a);
sqnorm_b = sum(b.*b);
inner_ab = a'*b;

num_a = size(sqnorm_a,2);
num_b = size(sqnorm_b,2);
sq_dist = repmat(sqnorm_a', [1 num_b]) + repmat(sqnorm_b, [num_a 1]) - 2*inner_ab;

% Rounding can push a squared distance slightly below zero, which makes
% sqrt return a complex value; keep only the real part.
d = real(sqrt(sq_dist));

% Optionally force zeros on the diagonal.
if df == 1
    off_diagonal = 1 - eye(size(d));
    d = d .* off_diagonal;
end

pca_classifier.m

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
function [ avg_err_rate ] = pca_classifier( k )
% PCA_CLASSIFIER - test error of a 1-nearest-neighbour classifier that
% works on PCA-reduced digit data.
%
%   avg_err_rate = pca_classifier(k)
%
%   k - number of principal components to keep (passed to pcaimg; it is
%       NOT the neighbour count, which is fixed at 1)
%
% Returns:
%   avg_err_rate - fraction of test samples that are misclassified
%
% Reads train2, train3, test2 and test3 from digits.mat; each is expected
% to hold one sample per column.

% Load into a struct so the variables this function depends on are explicit.
S = load('digits');
X_train = [S.train2, S.train3];
X_test  = [S.test2,  S.test3];

% Learn the mean and the principal axes from the training data only.
[base, mu, data_train] = pcaimg(X_train, k);

% Project the test data with the TRAINING mean and basis.  Running a second,
% independent PCA on the test set would centre it differently and use
% different (and arbitrarily signed) axes, so train and test coordinates
% would not live in the same space and nearest-neighbour distances between
% them would be meaningless.
n_test = size(X_test, 2);
data_test = base' * (X_test - repmat(mu, 1, n_test));

% Indicator labels: row 1 marks samples from the "2" sets, row 2 marks
% samples from the "3" sets.  Sizes are taken from the data.
n_train2 = size(S.train2, 2);
n_train3 = size(S.train3, 2);
n_test2  = size(S.test2, 2);
n_test3  = size(S.test3, 2);

label_train = [ones(1, n_train2)  zeros(1, n_train3); ...
               zeros(1, n_train2) ones(1, n_train3)];
label_test  = [ones(1, n_test2)   zeros(1, n_test3); ...
               zeros(1, n_test2)  ones(1, n_test3)];

result = knn(1, data_train, label_train, data_test);

% With two classes, a disagreement in row 1 is exactly one misclassification.
total = sum(abs(label_test(1,:) - result(1,:)));
avg_err_rate = total / n_test;
end
15+

pcaimg.m

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
function [base,mu,projX] = pcaimg(X,k)
% PCAIMG - principal component analysis of vectorised image data.
%
%   [base,mu,projX] = pcaimg(X,k)
%
%   X - data matrix, one vectorised image per column
%   k - number of eigenvectors to keep
%
% Returns:
%   base  - the k eigenvectors of the covariance matrix that have the
%           largest eigenvalues, one per column
%   mu    - mean of the data (column vector)
%   projX - the mean-centred data projected onto base (k rows)

disp('eigendecomposition...');

% Mean image, then centre every column on it.
[~, ndata] = size(X);
mu = sum(X,2) / ndata;
centred = X - repmat(mu, 1, ndata);

% Covariance matrix (normalised by the number of samples) and its
% eigendecomposition.
covmat = centred*centred' / ndata;
[eigvec, eigval] = eig(covmat);
eigval = diag(eigval);

% Rank the eigenvectors by decreasing eigenvalue and keep the first k.
[~, order] = sort(-eigval);
base = eigvec(:, order(1:k));

% Coordinates of the centred data in the k-dimensional subspace.
projX = base' * centred;

0 commit comments

Comments
 (0)