CAP 5610 Extra Credit Danilo Martinez
Bonus credits (5 bonus points added to the final score, submitted as a separate report):
- Apply PCA (method A) to classify digit images.
- Plot the eigenvectors obtained by PCA (method A + method B).
# Loading the necessary libraries
library(class)
library(caret)
library(mnist)
library(doParallel)
# Setting up parallel processing
cl <- makeCluster(detectCores())
registerDoParallel(cl)
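The cluster is not shut down anywhere in the original script; once the analysis below is finished, the worker processes can be released with the standard parallel call shown here.
# Small addition (not in the original script): run this after the analysis is complete.
stopCluster(cl)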
# Fetching the data set from the MNIST website
mnist <- download_mnist()
# Extracting the first 60,000 observations
inTrain <- head(mnist, 60000)
# Removing objects that are no longer needed
rm(mnist)
# Separating the labels from the predictors to prepare for the PCA analysis
responseY <- as.factor(inTrain[,dim(inTrain)[2]])
predictorX <- as.matrix(inTrain[,1:(dim(inTrain)[2]-1)])
# Performing Principal Component Analysis
pca <- princomp(predictorX)
# Plotting the eigenvalues for all digits
plot(pca, type = "l", main = "Total Digits Eigenvectors")
# Dividing the data by digit and performing PCA
for (i in 0:9){
data <- inTrain[ which(inTrain$Label==i),]
predictorX <- as.matrix(data[,1:(dim(data)[2]-1)])
# Performing Principal Component Analysis
pca <- princomp(predictorX)
# Plotting the eigenvalues for each specific digit
cat("Plotting Eigenvectors Digit = ", i, "\n", sep = "")
plot(pca, type = "l", main = "Eigenvectors")
}
## Plotting Eigenvectors Digit = 0
## Plotting Eigenvectors Digit = 1
## Plotting Eigenvectors Digit = 2
## Plotting Eigenvectors Digit = 3
## Plotting Eigenvectors Digit = 4
## Plotting Eigenvectors Digit = 5
## Plotting Eigenvectors Digit = 6
## Plotting Eigenvectors Digit = 7
## Plotting Eigenvectors Digit = 8
## Plotting Eigenvectors Digit = 9
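Note that plot(pca, type = "l") shows the component variances (a scree plot) rather than the eigenvectors themselves. To visualize the eigenvectors as images, the columns of pca$loadings can be reshaped into 28 x 28 pixel grids. The sketch below is illustrative only: it uses the pca object left over from the last loop iteration (digit 9), assumes the usual 28 x 28 MNIST layout, and the image orientation may need flipping depending on how the pixels are stored.
# Illustrative sketch (not in the original report): the first four eigenvectors of the
# last fitted per-digit PCA model, displayed as 28 x 28 images.
par(mfrow = c(2, 2))
for (comp in 1:4) {
  eigvec <- matrix(pca$loadings[, comp], nrow = 28, ncol = 28)
  image(eigvec[, 28:1], col = gray.colors(256), main = paste("Eigenvector", comp))
}
par(mfrow = c(1, 1))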
# Dividing the data: roughly 50,000 observations for training and 10,000 for testing
trainIndex <- createDataPartition(inTrain$Label, p = 0.833245, list = FALSE)
training <- inTrain[trainIndex, ]
testing <- inTrain[-trainIndex, ]
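A quick check (not part of the original code) that the stratified split produces roughly 50,000 training and 10,000 testing rows:
# Sanity check: sizes of the training/testing split.
nrow(training)
nrow(testing)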
# Removing objects that are no longer needed
rm(inTrain)
# Setting the number of cross-validation folds
n <- 10
# Establishing a vector for the per-fold results
results <- rep(0.00, n)
# Index of the first row of the current fold
min <- 1
# Looping over the folds. Within each fold, a separate PCA model is fit per digit on
# the remaining training rows; each held-out image is reconstructed from the top 154
# components of every digit model, and the predicted label is the digit whose model
# gives the smallest squared reconstruction error.
for(i in 1:n)
{
count <- nrow(training)
filter <- rep(TRUE, count)  # TRUE = row used for fitting, FALSE = held out this fold
max <- count / n            # number of rows per fold
up <- max + min - 1         # index of the last row of the current fold
res <- list()               # per-digit reconstruction errors for this fold
# Marking the rows of the current fold as held out
for(j in min:up){
filter[j] <- FALSE
}
# Separating the training data by label and fitting a PCA model per digit
for(k in 0:9){
X <- training[filter & training$Label == k, -ncol(training)]
pca <- princomp(X)
# Projecting the held-out rows onto the top 154 components and reconstructing them
validation <- training[!filter, -ncol(training)]
changeddata <- as.matrix(validation) %*% pca$loadings[, 1:154]
rebuilt <- changeddata %*% t(pca$loadings[, 1:154])
# Squared reconstruction error of each held-out image under digit k's model
error <- (rebuilt - validation)^2
error <- rowSums(error)
res[[k + 1]] <- error
}
# Labels of the held-out rows for this fold
testingy <- training[!filter, ncol(training)]
# Initializing the count of correct predictions to zero
accurates <- 0
# Checking accuracy on the held-out rows
for(l in 1:length(testingy)){
squarederror <- 0
prediction <- -1
# Choosing the digit whose PCA model gives the smallest reconstruction error
for(m in 1:10){
if(res[[m]][l] < squarederror | squarederror == 0){
prediction <- m - 1
squarederror <- res[[m]][l]
}
}
# Checking whether the prediction is correct
if(prediction == testingy[l])
{
accurates <- accurates + 1
}
}
# Storing the error rate for this fold
results[i] <- 1 - accurates / sum(!filter)
# Advancing to the first row of the next fold
min <- min + max
}
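The manual fold bookkeeping above (min, max and the logical filter vector) could also be replaced with caret's createFolds(), since caret is already loaded. This is only an alternative sketch; it was not used for the results reported below.
# Alternative fold construction with caret (illustrative only).
folds <- createFolds(training$Label, k = n, list = TRUE)
# For fold i, training[folds[[i]], ] would be the held-out rows and
# training[-folds[[i]], ] the rows used to fit the per-digit PCA models.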
#Printing and plotting results
print("Average Error Rate")
## [1] "Average Error Rate"
mean(results)
## [1] 0.09644
print("Average Standard Deviation")
## [1] "Average Standard Deviation"
sd(results)
## [1] 0.01397579
print("Displaying the Error Rates per Digit")
## [1] "Displaying the Error Rates per Digit"
plot(results,type = "l",main = "Error Rates per
Digit",
xlab = "Digit" )
The eigenvalue plots show that the variance held by the principal components is very similar for all digits in the MNIST data set: the curves decline at a similar rate, and most of the variability is captured by the first ten components.
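The 10,000-image hold-out set created with createDataPartition is never scored by the cross-validation loop above. As a possible extension, the same reconstruction-error rule could be applied to it. The sketch below is illustrative only: it reuses the 154-component cutoff from the loop and assumes the training and testing objects created earlier are still in the workspace.
# Illustrative sketch (not in the original report): scoring the hold-out set with one
# PCA model per digit fit on the full 50k training set.
testX <- as.matrix(testing[, -ncol(testing)])
testy <- testing[, ncol(testing)]
errors <- matrix(0, nrow = nrow(testX), ncol = 10)
for (k in 0:9) {
  digitPCA <- princomp(training[training$Label == k, -ncol(training)])
  topLoadings <- digitPCA$loadings[, 1:154]
  reconstructed <- (testX %*% topLoadings) %*% t(topLoadings)
  errors[, k + 1] <- rowSums((reconstructed - testX)^2)
}
# Predicted digit = model with the smallest reconstruction error; hold-out error rate.
predictions <- max.col(-errors) - 1
mean(predictions != testy)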