Introduction

Data

Data is from here

## read data
readMNIST('data/')
## INFO [2016-02-28 23:27:36] Loading the MNIST data set.
## INFO [2016-02-28 23:27:36] Loading train set with 60000 images.
## INFO [2016-02-28 23:27:42] Generating randomized data set and label matrix
## INFO [2016-02-28 23:27:46] class 0 = 5923 images
## INFO [2016-02-28 23:27:46] class 1 = 6742 images
## INFO [2016-02-28 23:27:46] class 2 = 5958 images
## INFO [2016-02-28 23:27:46] class 3 = 6131 images
## INFO [2016-02-28 23:27:46] class 4 = 5842 images
## INFO [2016-02-28 23:27:46] class 5 = 5421 images
## INFO [2016-02-28 23:27:46] class 6 = 5918 images
## INFO [2016-02-28 23:27:46] class 7 = 6265 images
## INFO [2016-02-28 23:27:46] class 8 = 5851 images
## INFO [2016-02-28 23:27:46] class 9 = 5949 images
## INFO [2016-02-28 23:27:46] Saving the train data (filename=train)
## INFO [2016-02-28 23:27:54] Loading test set with 10000 images.
## INFO [2016-02-28 23:27:55] Generating randomized data set and label matrix
## INFO [2016-02-28 23:27:55] class 0 = 980 images
## INFO [2016-02-28 23:27:55] class 1 = 1135 images
## INFO [2016-02-28 23:27:55] class 2 = 1032 images
## INFO [2016-02-28 23:27:55] class 3 = 1010 images
## INFO [2016-02-28 23:27:55] class 4 = 982 images
## INFO [2016-02-28 23:27:55] class 5 = 892 images
## INFO [2016-02-28 23:27:55] class 6 = 958 images
## INFO [2016-02-28 23:27:55] class 7 = 1028 images
## INFO [2016-02-28 23:27:55] class 8 = 974 images
## INFO [2016-02-28 23:27:55] class 9 = 1009 images
## [1] "Saving the test data (filename=test)"
## INFO [2016-02-28 23:27:57] Finished
## load data
load('data/train.RData')
load('data/test.RData')

Let’s have a brief view of the first 10 digits of MNIST

## plot first 1- digits of MNIST
rotate <- function(x) t(apply(x, 2, rev))
par(mfrow = c(2,5))
for(i in 1:10){
  image(rotate(t(matrix(trainData[i,],28,28))),col=grey.colors(255),xaxt="n",yaxt = "n")
}

Experiment 1

In the first experiment, I implement Principle Component Analysis and OCR for two digits: 2 and 9

## extract digits 2 and 9
target<-cbind(trainLabels,trainData)
target<-target[which(target[,3]==1 | target[,10]==1),]
target.label<-target[,1:10]
X<-target[,11:ncol(target)]

1. XZCVPR:

X

## X
dim(X)
## [1] 11907   784

Z

## Z
u<-colMeans(X)
Z<-sweep(X,2,u,"-")
# validate Z
min(Z)
## [1] -0.9998854
max(Z)
## [1] 0.7009856
par(mfrow = c(1,1))
plot(u,type='l')

C

## C
C<-cov(Z)
# Validate C
dim(C)
## [1] 784 784
image(C,col=grey.colors(255),xaxt="n",yaxt = "n")

V

## V
V<-t(eigen(C)$vectors)
# Validate V
dim(V)
## [1] 784 784

P

## P
P=Z %*% t(V)
# Validate P
dim(P)
## [1] 11907   784

R

## R
# reconstruct with first two principle components
R=P[,1:2] %*% V[1:2,]
R<-sweep(R,2,u,"+")
# Validate P
dim(R)
## [1] 11907   784

Now we can reconstruct the matrix.

2. Comparison

Let’s compare the original images and reconstructed images using first N components. You can slide the button to set N and see the reconstructed images.

Original images:

Reconstruction with first 1 principle component:

Reconstruction with first 50 principle component:

Click here for an interactive visualization of the reconstructed digits. We can see that the reduction does not significantly compromise the identity of the samples.

To further have an insight of how much idnetity each principle component can explain, we can further check the proportion of variance explained for each principle component.

How many principle components do we need to reach 90% proportion of variance explained?

## How Many PCs Contains more than 90% variance?
min(which(cumsum(pve) > 0.9))
## [1] 83

3. OCR

Then we build an OCR algorithm to identify ‘2’ and ‘9’.

## preprocess Principle components and labels
nt <- nrow(X)
Train <- as.data.frame(P)
names(Train)<-c(gsub(" ","",paste('PC',foreach(n=1:ncol(X)) %do% n)))
for (i in 1:nrow(target.label)){
  if (target.label[i,3]==1){
    Train$label[i]<-2
  }
  else {
    Train$label[i]<-9
  }
}

## OCR algorithm
PC1min<-min(Train$PC1)
PC1max<-max(Train$PC1)
PC2min<-min(Train$PC2)
PC2max<-max(Train$PC2)
nbins<-25
H1=matrix(0,nrow=nbins,ncol=nbins)
H2=matrix(0,nrow=nbins,ncol=nbins)

position1 <- function(p) round(1+(nbins-1)*(p[,1]-PC1min)/(PC1max-PC1min))
position2 <- function(p) round(1+(nbins-1)*((p[,2]-PC2min)/(PC2max-PC2min)))

for (i in 1:nrow(P)){
  p<-as.matrix(Train[i,1:2])
  if (Train$label[i]==2){
    H1[position1(p),position2(p)]=H1[position1(p),position2(p)]+1
  }
  else{
    H2[position1(p),position2(p)]=H2[position1(p),position2(p)]+1
  }
}

OCR <- function(x) {
  z<-x-u
  p<-z %*% t(V)
  p<-t(as.matrix(p[,1:2]))
  label<-ifelse(H1[ifelse(position1(p)>0,position1(p),1),ifelse(position2(p)>0,position2(p),1)]>H2[ifelse(position1(p)>0,position1(p),1),ifelse(position2(p)>0,position2(p),1)],2,9)
  return(label)
}

4. Test Accuracy on Test dataset

# test accuracy
test<-cbind(testLabels,testData)
test<-test[which(test[,3]==1 | test[,10]==1),]
test.label<-test[,1:10]
Y<-test[,11:ncol(test)]
result<-data.frame(matrix(0,nrow=nrow(Y),ncol=3),stringsAsFactors=FALSE)
for (i in 1:nrow(Y)){
  if (test.label[i,3]==1){
    result[i,1]<-2
  }
  if (test.label[i,10]==1){
    result[i,1]<-9
  }
  result[i,2]<-OCR(Y[i,])
  if(result[i,1]==result[i,2]){
    result[i,3]<-1
  }
  if(result[i,1]!=result[i,2]){
    result[i,3]<-0
  }
}

# accuracy
sum(result[,3])/nrow(result)
## [1] 0.9652131

5. Visualization

## Scatter plot for first two components
par(mfrow = c(1,1))
ggplot(data = Train, aes(x = PC1, y = PC2)) + 
  geom_point(aes(color = factor(label)), size = 1, alpha = 0.5) + 
  xlab("first principal component") +
  ylab("second principal component")

##  2D histgram for first principle component
ggplot(Train, aes(PC1, fill = as.character(label))) + geom_histogram(alpha = 0.75, aes(y = ..count..), position = 'identity',binwidth=1) + ggtitle("Histogram") + theme(plot.title = element_text(family = "Trebuchet MS", color="#666666", face="bold", size=20))

##  3D histgram for first two principle components
for (i in c(9,2)){
  if (i==9){
    hist3D(z=H2, col='green',border="gray",shade = 0.4,alpha=0.4,xlab='first component',ylab='second component',zlab='count')
  }
  else{
    hist3D(z=H1, col='red',border="gray",shade = 0.4,alpha=0.4,add=TRUE)
  }
}

Experiment 2

Using same logic, I implement Principle Component Analysis and OCR for all ten digits.

1. Comparison

Let’s compare the original images and reconstructed images using first N components. You can slide the button to set N and see the reconstructed images.

Original image:

Reconstruction with first 1 principle component:

Reconstruction with first 50 principle component:

Click here for an interactive visualization of the reconstructed digits. We can see that the reduction does not significantly compromise the identity of the samples

We can further see the proportion of variance explained for each principle component.

How Many PCs Contains more than 90% variance?

## How Many PCs Contains more than 90% variance?
min(which(cumsum(pve) > 0.9))
## [1] 87

2. OCR

## preprocess Principle components and labels
nt <- nrow(X)
Train <- as.data.frame(P)
names(Train)<-c(gsub(" ","",paste('PC',foreach(n=1:ncol(X)) %do% n)))
for (i in 1:nrow(trainLabels)){
  for (j in 1:10){
    if (trainLabels[i,j]==1){
      Train$label[i]<-j-1
    }
  }
}

## OCR algorithm
PC1min<-min(Train$PC1)
PC1max<-max(Train$PC1)
PC2min<-min(Train$PC2)
PC2max<-max(Train$PC2)
nbins<-100
H1=matrix(0,nrow=nbins,ncol=nbins)
H2=matrix(0,nrow=nbins,ncol=nbins)
H3=matrix(0,nrow=nbins,ncol=nbins)
H4=matrix(0,nrow=nbins,ncol=nbins)
H5=matrix(0,nrow=nbins,ncol=nbins)
H6=matrix(0,nrow=nbins,ncol=nbins)
H7=matrix(0,nrow=nbins,ncol=nbins)
H8=matrix(0,nrow=nbins,ncol=nbins)
H9=matrix(0,nrow=nbins,ncol=nbins)
H10=matrix(0,nrow=nbins,ncol=nbins)

for (i in 1:nrow(P)){
  p<-as.matrix(Train[i,1:2])
  if (Train$label[i]==0){
    H1[position1(p),position2(p)]=H1[position1(p),position2(p)]+1
  }
  if (Train$label[i]==1){
    H2[position1(p),position2(p)]=H2[position1(p),position2(p)]+1
  }
  if (Train$label[i]==2){
    H3[position1(p),position2(p)]=H3[position1(p),position2(p)]+1
  }
  if (Train$label[i]==3){
    H4[position1(p),position2(p)]=H4[position1(p),position2(p)]+1
  }
  if (Train$label[i]==4){
    H5[position1(p),position2(p)]=H5[position1(p),position2(p)]+1
  }
  if (Train$label[i]==5){
    H6[position1(p),position2(p)]=H6[position1(p),position2(p)]+1
  }
  if (Train$label[i]==6){
    H7[position1(p),position2(p)]=H7[position1(p),position2(p)]+1
  }
  if (Train$label[i]==7){
    H8[position1(p),position2(p)]=H8[position1(p),position2(p)]+1
  }
  if (Train$label[i]==8){
    H9[position1(p),position2(p)]=H9[position1(p),position2(p)]+1
  }
  if (Train$label[i]==9){
    H10[position1(p),position2(p)]=H10[position1(p),position2(p)]+1
  }
}

OCR <- function(x) {
  z<-x-u
  p<-z %*% t(V)
  p<-t(as.matrix(p[,1:2]))
  pos1<-ifelse(position1(p)>0,position1(p),1)
  pos1<-ifelse(pos1<=nbins,pos1,nbins)
  pos2<-ifelse(position2(p)>0,position2(p),1)
  pos2<-ifelse(pos2<=nbins,pos2,nbins)
  Hs<-c(H1[pos1,pos2],H2[pos1,pos2],H3[pos1,pos2],H4[pos1,pos2],H5[pos1,pos2],H6[pos1,pos2],H7[pos1,pos2],H8[pos1,pos2],H9[pos1,pos2],H10[pos1,pos2])
  label<-which(Hs==max(Hs))-1
  return(label)
}

3. Test Accuracy on Test dataset

# test accuracy
Y<-testData
result<-data.frame(matrix(0,nrow=nrow(Y),ncol=3),stringsAsFactors=FALSE)
for (i in 1:nrow(Y)){
  result[i,1]<-which(testLabels[i,]==1)-1
  if (length(OCR(Y[i,]))==1)
  {
    result[i,2]<-OCR(Y[i,])
    if(result[i,1]==result[i,2]){
      result[i,3]<-1
    }
    if(result[i,1]!=result[i,2]){
      result[i,3]<-0
    }
  }
  else{
    for (k in 1:length(OCR(Y[i,]))){
      result[i,2]<-OCR(Y[i,])[k]
      if(result[i,1]==result[i,2]){
        result[i,3]<-1
      }
    }
  }
}

# accuracy
sum(result[,3])/nrow(result)
## [1] 0.4853

4. Visualization

Scatter plot for first two components

2D histgram for first principle components

3D histgram for first two principle components

Conclusion

By completing each step of Principle Component Analysis and building OCR algorith from scratch, I have practiced and learnt a lot. The future work could be to integrate EM algorithm into OCR. Also I’m thinking about using Random Forest to predict.