The data is from here.
## read data
readMNIST('data/')
## INFO [2016-02-28 23:27:36] Loading the MNIST data set.
## INFO [2016-02-28 23:27:36] Loading train set with 60000 images.
## INFO [2016-02-28 23:27:42] Generating randomized data set and label matrix
## INFO [2016-02-28 23:27:46] class 0 = 5923 images
## INFO [2016-02-28 23:27:46] class 1 = 6742 images
## INFO [2016-02-28 23:27:46] class 2 = 5958 images
## INFO [2016-02-28 23:27:46] class 3 = 6131 images
## INFO [2016-02-28 23:27:46] class 4 = 5842 images
## INFO [2016-02-28 23:27:46] class 5 = 5421 images
## INFO [2016-02-28 23:27:46] class 6 = 5918 images
## INFO [2016-02-28 23:27:46] class 7 = 6265 images
## INFO [2016-02-28 23:27:46] class 8 = 5851 images
## INFO [2016-02-28 23:27:46] class 9 = 5949 images
## INFO [2016-02-28 23:27:46] Saving the train data (filename=train)
## INFO [2016-02-28 23:27:54] Loading test set with 10000 images.
## INFO [2016-02-28 23:27:55] Generating randomized data set and label matrix
## INFO [2016-02-28 23:27:55] class 0 = 980 images
## INFO [2016-02-28 23:27:55] class 1 = 1135 images
## INFO [2016-02-28 23:27:55] class 2 = 1032 images
## INFO [2016-02-28 23:27:55] class 3 = 1010 images
## INFO [2016-02-28 23:27:55] class 4 = 982 images
## INFO [2016-02-28 23:27:55] class 5 = 892 images
## INFO [2016-02-28 23:27:55] class 6 = 958 images
## INFO [2016-02-28 23:27:55] class 7 = 1028 images
## INFO [2016-02-28 23:27:55] class 8 = 974 images
## INFO [2016-02-28 23:27:55] class 9 = 1009 images
## [1] "Saving the test data (filename=test)"
## INFO [2016-02-28 23:27:57] Finished
## load data
load('data/train.RData')
load('data/test.RData')
Let’s take a brief look at the first 10 digits of MNIST:
## plot first 10 digits of MNIST
rotate <- function(x) t(apply(x, 2, rev))
par(mfrow = c(2,5))
for(i in 1:10){
  image(rotate(t(matrix(trainData[i,],28,28))),col=grey.colors(255),xaxt="n",yaxt = "n")
}
In the first experiment, I implement Principal Component Analysis (PCA) and an OCR classifier for two digits: 2 and 9.
## extract digits 2 and 9
target<-cbind(trainLabels,trainData)                      # columns 1-10: one-hot labels, columns 11 onward: pixels
target<-target[which(target[,3]==1 | target[,10]==1),]    # column 3 = digit 2, column 10 = digit 9
target.label<-target[,1:10]
X<-target[,11:ncol(target)]
## X: data matrix (one image per row)
dim(X)
## [1] 11907 784
## Z: mean-centered data
u<-colMeans(X)
Z<-sweep(X,2,u,"-")
# validate Z
min(Z)
## [1] -0.9998854
max(Z)
## [1] 0.7009856
par(mfrow = c(1,1))
plot(u,type='l')
## C: covariance matrix of Z
C<-cov(Z)
# Validate C
dim(C)
## [1] 784 784
image(C,col=grey.colors(255),xaxt="n",yaxt = "n")
## V: eigenvectors of C, one principal direction per row
V<-t(eigen(C)$vectors)
# Validate V
dim(V)
## [1] 784 784
## P: projection of Z onto the principal components (scores)
P=Z %*% t(V)
# Validate P
dim(P)
## [1] 11907 784
## R: reconstruction
# reconstruct with the first two principal components
R=P[,1:2] %*% V[1:2,]
R<-sweep(R,2,u,"+")
# Validate R
dim(R)
## [1] 11907 784
Now we can reconstruct the data matrix.
Let’s compare the original images with images reconstructed from the first N principal components. You can drag the slider to set N and see the reconstructed images.
Original images:
Reconstruction with the first principal component:
Reconstruction with the first 50 principal components:
Click here for an interactive visualization of the reconstructed digits. We can see that the dimensionality reduction does not significantly compromise the identity of the samples.
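The code behind the interactive reconstruction is not shown above; the sketch below assumes the objects already defined (P, V, u and the rotate helper) and a hypothetical choice of N, and plots the first ten samples of the 2/9 subset reconstructed from the first N principal components.
## reconstruct with the first N principal components (N is a hypothetical choice)
N <- 50
R.N <- sweep(P[,1:N] %*% V[1:N,], 2, u, "+")
par(mfrow = c(2,5))
for (i in 1:10){
  image(rotate(t(matrix(R.N[i,],28,28))), col=grey.colors(255), xaxt="n", yaxt="n")
}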
To gain further insight into how much of that identity each principal component captures, we can check the proportion of variance explained (PVE) by each principal component.
How many principal components do we need to reach 90% of the variance explained?
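The vector pve used in the next chunk is not defined anywhere in the code shown; a minimal sketch of the assumed computation, taking the proportion of variance explained from the eigenvalues of C:
## proportion of variance explained by each principal component (assumed definition)
pve <- eigen(C)$values / sum(eigen(C)$values)
plot(cumsum(pve), type='l', xlab='principal component', ylab='cumulative PVE')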
## how many PCs are needed to explain more than 90% of the variance?
min(which(cumsum(pve) > 0.9))
## [1] 83
Then we build an OCR algorithm to identify ‘2’ and ‘9’. The idea is to bin the first two principal components of each training sample into one 2D histogram per class, and to label a new sample with the class whose histogram has the higher count in the sample’s bin.
## preprocess principal components and labels
nt <- nrow(X)
Train <- as.data.frame(P)
names(Train)<-paste0('PC',1:ncol(X))
for (i in 1:nrow(target.label)){
  if (target.label[i,3]==1){
    Train$label[i]<-2
  } else {
    Train$label[i]<-9
  }
}
## OCR algorithm
PC1min<-min(Train$PC1)
PC1max<-max(Train$PC1)
PC2min<-min(Train$PC2)
PC2max<-max(Train$PC2)
nbins<-25
H1=matrix(0,nrow=nbins,ncol=nbins)   # 2D histogram of (PC1, PC2) for digit 2
H2=matrix(0,nrow=nbins,ncol=nbins)   # 2D histogram of (PC1, PC2) for digit 9
# map the first two principal components of a sample to bin indices in 1..nbins
position1 <- function(p) round(1+(nbins-1)*(p[,1]-PC1min)/(PC1max-PC1min))
position2 <- function(p) round(1+(nbins-1)*((p[,2]-PC2min)/(PC2max-PC2min)))
for (i in 1:nrow(P)){
  p<-as.matrix(Train[i,1:2])
  if (Train$label[i]==2){
    H1[position1(p),position2(p)]=H1[position1(p),position2(p)]+1
  } else{
    H2[position1(p),position2(p)]=H2[position1(p),position2(p)]+1
  }
}
OCR <- function(x) {
  # project the image onto the first two principal components
  z<-x-u
  p<-z %*% t(V)
  p<-t(as.matrix(p[,1:2]))
  # clamp bin indices at the lower end, then compare the two class histograms
  pos1<-ifelse(position1(p)>0,position1(p),1)
  pos2<-ifelse(position2(p)>0,position2(p),1)
  label<-ifelse(H1[pos1,pos2]>H2[pos1,pos2],2,9)
  return(label)
}
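A quick, hypothetical sanity check (not part of the original post): classify one training sample and compare it with its true label.
## predicted vs. true label for the first sample of the 2/9 subset
OCR(X[1,])
ifelse(target.label[1,3]==1, 2, 9)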
# test accuracy
test<-cbind(testLabels,testData)
test<-test[which(test[,3]==1 | test[,10]==1),]   # keep only test images of digits 2 and 9
test.label<-test[,1:10]
Y<-test[,11:ncol(test)]
result<-data.frame(matrix(0,nrow=nrow(Y),ncol=3),stringsAsFactors=FALSE)   # columns: true label, predicted label, correct (0/1)
for (i in 1:nrow(Y)){
  if (test.label[i,3]==1){
    result[i,1]<-2
  }
  if (test.label[i,10]==1){
    result[i,1]<-9
  }
  result[i,2]<-OCR(Y[i,])
  if(result[i,1]==result[i,2]){
    result[i,3]<-1
  }
  if(result[i,1]!=result[i,2]){
    result[i,3]<-0
  }
}
# accuracy
sum(result[,3])/nrow(result)
## [1] 0.9652131
## Scatter plot for first two components
par(mfrow = c(1,1))
ggplot(data = Train, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = factor(label)), size = 1, alpha = 0.5) +
  xlab("first principal component") +
  ylab("second principal component")
## 2D histogram for the first principal component
ggplot(Train, aes(PC1, fill = as.character(label))) +
  geom_histogram(alpha = 0.75, aes(y = ..count..), position = 'identity', binwidth = 1) +
  ggtitle("Histogram") +
  theme(plot.title = element_text(family = "Trebuchet MS", color="#666666", face="bold", size=20))
## 3D histogram for the first two principal components
for (i in c(9,2)){
  if (i==9){
    # digit 9 (H2) in green
    hist3D(z=H2, col='green',border="gray",shade = 0.4,alpha=0.4,xlab='first component',ylab='second component',zlab='count')
  } else{
    # digit 2 (H1) in red
    hist3D(z=H1, col='red',border="gray",shade = 0.4,alpha=0.4,add=TRUE)
  }
}
Using the same logic, I implement Principal Component Analysis and OCR for all ten digits. The PCA objects are rebuilt on the full training set, as sketched below.
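The recomputation itself is not shown; a minimal sketch, assuming the same steps as in the two-digit case, would be:
## rebuild the PCA objects on all ten digits
X <- trainData
u <- colMeans(X)
Z <- sweep(X, 2, u, "-")
C <- cov(Z)
e <- eigen(C)
V <- t(e$vectors)
P <- Z %*% t(V)
pve <- e$values / sum(e$values)   # proportion of variance explained (assumed definition, as above)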
Again, let’s compare the original images with images reconstructed from the first N principal components. You can drag the slider to set N and see the reconstructed images.
Original image:
Reconstruction with the first principal component:
Reconstruction with the first 50 principal components:
Click here for an interactive visualization of the reconstructed digits. We can see that the dimensionality reduction does not significantly compromise the identity of the samples.
We can again check the proportion of variance explained by each principal component.
How many PCs are needed to explain more than 90% of the variance?
## how many PCs are needed to explain more than 90% of the variance?
min(which(cumsum(pve) > 0.9))
## [1] 87
## preprocess principal components and labels
nt <- nrow(X)
Train <- as.data.frame(P)
names(Train)<-paste0('PC',1:ncol(X))
for (i in 1:nrow(trainLabels)){
  for (j in 1:10){
    if (trainLabels[i,j]==1){
      Train$label[i]<-j-1
    }
  }
}
## OCR algorithm
PC1min<-min(Train$PC1)
PC1max<-max(Train$PC1)
PC2min<-min(Train$PC2)
PC2max<-max(Train$PC2)
nbins<-100
# one 2D histogram per digit class: H1 for digit 0, H2 for digit 1, ..., H10 for digit 9
H1=matrix(0,nrow=nbins,ncol=nbins)
H2=matrix(0,nrow=nbins,ncol=nbins)
H3=matrix(0,nrow=nbins,ncol=nbins)
H4=matrix(0,nrow=nbins,ncol=nbins)
H5=matrix(0,nrow=nbins,ncol=nbins)
H6=matrix(0,nrow=nbins,ncol=nbins)
H7=matrix(0,nrow=nbins,ncol=nbins)
H8=matrix(0,nrow=nbins,ncol=nbins)
H9=matrix(0,nrow=nbins,ncol=nbins)
H10=matrix(0,nrow=nbins,ncol=nbins)
for (i in 1:nrow(P)){
  p<-as.matrix(Train[i,1:2])
  if (Train$label[i]==0){ H1[position1(p),position2(p)]=H1[position1(p),position2(p)]+1 }
  if (Train$label[i]==1){ H2[position1(p),position2(p)]=H2[position1(p),position2(p)]+1 }
  if (Train$label[i]==2){ H3[position1(p),position2(p)]=H3[position1(p),position2(p)]+1 }
  if (Train$label[i]==3){ H4[position1(p),position2(p)]=H4[position1(p),position2(p)]+1 }
  if (Train$label[i]==4){ H5[position1(p),position2(p)]=H5[position1(p),position2(p)]+1 }
  if (Train$label[i]==5){ H6[position1(p),position2(p)]=H6[position1(p),position2(p)]+1 }
  if (Train$label[i]==6){ H7[position1(p),position2(p)]=H7[position1(p),position2(p)]+1 }
  if (Train$label[i]==7){ H8[position1(p),position2(p)]=H8[position1(p),position2(p)]+1 }
  if (Train$label[i]==8){ H9[position1(p),position2(p)]=H9[position1(p),position2(p)]+1 }
  if (Train$label[i]==9){ H10[position1(p),position2(p)]=H10[position1(p),position2(p)]+1 }
}
OCR <- function(x) {
  # project the image onto the first two principal components
  z<-x-u
  p<-z %*% t(V)
  p<-t(as.matrix(p[,1:2]))
  # clamp bin indices to 1..nbins
  pos1<-ifelse(position1(p)>0,position1(p),1)
  pos1<-ifelse(pos1<=nbins,pos1,nbins)
  pos2<-ifelse(position2(p)>0,position2(p),1)
  pos2<-ifelse(pos2<=nbins,pos2,nbins)
  # pick the digit whose histogram has the highest count in this bin (ties return several labels)
  Hs<-c(H1[pos1,pos2],H2[pos1,pos2],H3[pos1,pos2],H4[pos1,pos2],H5[pos1,pos2],H6[pos1,pos2],H7[pos1,pos2],H8[pos1,pos2],H9[pos1,pos2],H10[pos1,pos2])
  label<-which(Hs==max(Hs))-1
  return(label)
}
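A hypothetical spot check of the ten-class classifier on a single test image; note that OCR can return more than one label when histogram counts tie, which the evaluation loop below accounts for.
## predicted label(s) vs. true label for the first test image
OCR(testData[1,])
which(testLabels[1,]==1) - 1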
# test accuracy
Y<-testData
result<-data.frame(matrix(0,nrow=nrow(Y),ncol=3),stringsAsFactors=FALSE)   # columns: true label, predicted label, correct (0/1)
for (i in 1:nrow(Y)){
  result[i,1]<-which(testLabels[i,]==1)-1
  prediction<-OCR(Y[i,])
  if (length(prediction)==1){
    result[i,2]<-prediction
    if(result[i,1]==result[i,2]){
      result[i,3]<-1
    }
    if(result[i,1]!=result[i,2]){
      result[i,3]<-0
    }
  } else{
    # tie between classes: count the sample as correct if any tied label matches
    for (k in 1:length(prediction)){
      result[i,2]<-prediction[k]
      if(result[i,1]==result[i,2]){
        result[i,3]<-1
      }
    }
  }
}
# accuracy
sum(result[,3])/nrow(result)
## [1] 0.4853
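To see which digits the two-component classifier confuses most, a confusion table (not in the original analysis) can be tabulated from result:
## rows: true digit, columns: predicted digit
table(actual = result[,1], predicted = result[,2])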
Scatter plot for the first two components:
2D histogram for the first principal component:
3D histogram for the first two principal components:
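The plotting code for these three figures is omitted above; a sketch of the first of them (the ten-class scatter plot), mirroring the two-digit version and assuming ggplot2 is loaded as before:
## scatter plot of the first two principal components, colored by digit
ggplot(data = Train, aes(x = PC1, y = PC2)) +
  geom_point(aes(color = factor(label)), size = 1, alpha = 0.5) +
  xlab("first principal component") +
  ylab("second principal component")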
By completing each step of Principal Component Analysis and building the OCR algorithm from scratch, I have practiced and learned a lot. Future work could integrate an EM algorithm into the OCR step, and I am also considering using a Random Forest for prediction.