Introduction

In this project we predict handwritten digits using decision trees. The MNIST data set is used. We first reduce the dimensionality of the data with PCA, then apply the RPart algorithm to predict the correct class of each digit, and compare it against a k-nearest-neighbour (kNN) classifier.
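
The code below relies on a few add-on packages: brewer.pal() from RColorBrewer for the colour palettes, Rtsne for the t-SNE embedding, rpart for the decision tree, prp() from rpart.plot and draw.tree() from maptree for the tree plots, and knn() from the class package. A minimal setup chunk, assuming these packages are already installed:

library(RColorBrewer)  # brewer.pal() palettes used in the bar plots and tree plots
library(Rtsne)         # Rtsne() for the t-SNE embedding
library(rpart)         # rpart(), printcp(), prune()
library(rpart.plot)    # prp() tree plots
library(maptree)       # draw.tree()
library(class)         # knn() classifier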

digit <- read.csv("/Users/neha/Documents/DS-630-ML/pca/mnist.csv")
nrow(digit)
## [1] 59999
# Use only the first 20,000 of the 59,999 digits
digit <- digit[1:20000, ]
# Divide data in 80:20 ratio - training:test
samp_size <- floor(0.80 * nrow(digit))
train_ind <- sample(seq_len(nrow(digit)), size = samp_size)

# Training data
DATASET.train <- as.data.frame(digit[train_ind,])

# Test Data
DATASET.test <-  as.data.frame(digit[-train_ind,])
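
Note that the call to sample() above is not seeded, so the exact 80:20 split changes from run to run. For a reproducible split, a seed can be set immediately before the sampling step (the value 123 is arbitrary and only illustrative):

set.seed(123)  # illustrative seed; place immediately before the sample() call above
train_ind <- sample(seq_len(nrow(digit)), size = samp_size)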

Image Visualization

# Reverse the row order so image() draws each digit the right way up
flip <- function(matrix){
    apply(matrix, 2, rev)
}

par(mfrow = c(3, 3))
for (i in 1:27){
    # Column 1 holds the label; the remaining 784 columns hold the 28x28 pixel values
    dit <- flip(matrix(rev(as.numeric(DATASET.train[i, -1])), nrow = 28)) # look at one digit
    image(dit, col = grey.colors(255))
}
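
For easier visual checking, the true label (column 1, named X5 after the CSV import) can be shown as each panel's title, a small variation on the loop above:

par(mfrow = c(3, 3))
for (i in 1:9){
    dit <- flip(matrix(rev(as.numeric(DATASET.train[i, -1])), nrow = 28))
    image(dit, col = grey.colors(255), main = DATASET.train[i, 1])  # label as panel title
}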

Data Exploration

barplot(table(DATASET.train$X5), main="Total Number of Digits (Training Set)", col=brewer.pal(10,"Set1"),
    xlab="Numbers", ylab = "Frequency of Numbers")
## Warning in brewer.pal(10, "Set1"): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors

barplot(table(DATASET.test$X5), main="Total Number of Digits (Test Set)", col=brewer.pal(10,"Set1"),
    xlab="Numbers", ylab = "Frequency of Numbers")
## Warning in brewer.pal(10, "Set1"): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
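
Beyond the bar plots, the class balance can also be summarised numerically; a quick check of the relative frequency of each digit in the training split:

round(prop.table(table(DATASET.train$X5)), 3)  # proportion of each digit (0-9)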

tsne <- Rtsne(DATASET.train[1:300,-1], dims = 2, perplexity=20, verbose=TRUE, max_iter = 500)
## Read the 300 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Normalizing input...
## Building tree...
##  - point 0 of 300
## Done in 0.02 seconds (sparsity = 0.279044)!
## Learning embedding...
## Iteration 50: error is 61.691165 (50 iterations in 0.12 seconds)
## Iteration 100: error is 61.732801 (50 iterations in 0.14 seconds)
## Iteration 150: error is 61.767561 (50 iterations in 0.14 seconds)
## Iteration 200: error is 62.397418 (50 iterations in 0.14 seconds)
## Iteration 250: error is 61.965210 (50 iterations in 0.15 seconds)
## Iteration 300: error is 1.106948 (50 iterations in 0.12 seconds)
## Iteration 350: error is 0.947519 (50 iterations in 0.10 seconds)
## Iteration 400: error is 0.905200 (50 iterations in 0.09 seconds)
## Iteration 450: error is 0.885991 (50 iterations in 0.10 seconds)
## Iteration 500: error is 0.869277 (50 iterations in 0.10 seconds)
## Fitting performed in 1.20 seconds.
colors <- rainbow(length(unique(DATASET.train$X5)))
names(colors) <- unique(DATASET.train$X5)
labels_300 <- DATASET.train$X5[1:300]  # labels of the 300 rows embedded above
plot(tsne$Y, t = 'n', main = "tsne")
text(tsne$Y, labels = labels_300, col = colors[as.character(labels_300)])

Principal Component Analysis

features <- digit[, -1]
pca <- princomp(features)
std_dev <- pca$sdev                  # standard deviations of all principal components
pr_var <- std_dev^2                  # variance explained by each component
prop_varex <- pr_var / sum(pr_var)   # proportion of total variance explained

As seen in the plot below, the first 260 principal components explain most of the variation in the data, so we keep only the first 260 components.

plot(cumsum(prop_varex[1:260]), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     type = "b")

New data after employing PCA

new_digit <- data.frame(number = digit[, "X5"], pca$scores)
new_digit <- new_digit[, 1:261]  # column 1 is the label; columns 2:261 are the first 260 components
samp_size <- floor(0.80 * nrow(new_digit))
train_ind <- sample(seq_len(nrow(new_digit)), size = samp_size)
train_set <- new_digit[train_ind, ]
test_set  <- new_digit[-train_ind, ]
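
Note that pca$scores only covers the 20,000 digits used to fit the PCA. If genuinely new digits had to be scored later, they could be projected onto the same components with predict() instead of refitting; a short sketch, assuming a hypothetical data frame new_digits with the same 784 pixel columns (and column names) as features:

# Project unseen digits onto the components fitted above (new_digits is hypothetical)
new_scores <- predict(pca, newdata = new_digits)[, 1:260]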

RPart

pc <- proc.time()
model.rpart <- rpart(number ~ ., method = "class", data = train_set)
proc.time() - pc
##    user  system elapsed 
##  31.085   0.155  31.314
printcp(model.rpart)
## 
## Classification tree:
## rpart(formula = number ~ ., data = train_set, method = "class")
## 
## Variables actually used in tree construction:
## [1] Comp.1 Comp.2 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 
## Root node error: 14178/16000 = 0.88613
## 
## n= 16000 
## 
##          CP nsplit rel error  xerror      xstd
## 1  0.099238      0   1.00000 1.00000 0.0028340
## 2  0.098321      1   0.90076 0.90330 0.0035657
## 3  0.074552      2   0.80244 0.80406 0.0040379
## 4  0.071978      3   0.72789 0.74235 0.0042328
## 5  0.063126      5   0.58393 0.59056 0.0044560
## 6  0.060587      6   0.52081 0.52292 0.0044488
## 7  0.023134      7   0.46022 0.46643 0.0043933
## 8  0.021160      8   0.43709 0.43913 0.0043498
## 9  0.011849      9   0.41593 0.42249 0.0043177
## 10 0.010000     10   0.40408 0.40718 0.0042845
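
The cp table above also shows how the tree could be pruned; a common heuristic is to cut at the complexity parameter with the smallest cross-validated error (xerror). A short sketch using the fitted model:

# Prune at the CP value with the smallest cross-validated error
best_cp <- model.rpart$cptable[which.min(model.rpart$cptable[, "xerror"]), "CP"]
model.pruned <- prune(model.rpart, cp = best_cp)
printcp(model.pruned)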

Accuracy - RPart

prediction.rpart <- predict(model.rpart, newdata = test_set, type = "class")
table(`Actual Class` = test_set$number, `Predicted Class` = prediction.rpart)
##             Predicted Class
## Actual Class   0   1   2   3   4   5   6   7   8   9
##            0 227   0  24  77   0  29   6   5  25   7
##            1   2 405  15   6   0  26   4   0   1   0
##            2   6   5 299  25   1   2  13   1  44   7
##            3   3   5  13 318   2  13   8   0  46   3
##            4   1   5  11   7 203   4   4   5  11 151
##            5  41   2  28  72  16 117  10   2  54  29
##            6   3   3  45  18   0  16 225  12   0  60
##            7   1  16   5   4   8   2   0 238  25 115
##            8   1   0  23  31   3  44   2   1 254  29
##            9   3   5   7   9  26   9   0  22  17 272
error.rate.rpart <- sum(test_set$number != prediction.rpart)/nrow(test_set)
accuracy <- round((1 - error.rate.rpart) *100,2)
accuracy
## [1] 63.95
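
The overall accuracy hides large per-class differences; in the table above, for example, digit 5 is classified correctly far less often than digit 1. Per-digit recall can be read off the confusion matrix directly:

conf <- table(Actual = test_set$number, Predicted = prediction.rpart)
round(diag(conf) / rowSums(conf), 2)  # fraction of each digit classified correctly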

Tree Visualizations

heat.tree <- function(tree, low.is.green = FALSE, ...) { # dots args passed to prp
    # Colour branches and node boxes along a heat scale based on each node's fitted value
    y <- tree$frame$yval
    if (low.is.green)
        y <- -y
    max <- max(y)
    min <- min(y)
    cols <- rainbow(99, end = .36)[
        ifelse(y > y[1], (y - y[1]) * (99 - 50) / (max - y[1]) + 50,
                         (y - min) * (50 - 1) / (y[1] - min) + 1)]
    prp(tree, branch.col = cols, box.col = cols, ...)
}

heat.tree(model.rpart, type=4, varlen=0, faclen=0, fallen.leaves=TRUE)

draw.tree(model.rpart, cex = 0.5, nodeinfo = TRUE, col = gray(0:8/8))

prp(model.rpart, extra=6, main="Classification (RPART). Tree of Handwritten Digit Recognition ",
box.col=brewer.pal(10,"Set3")[model.rpart$frame$yval])
## Warning: extra=6 but the response has 10 levels (only the 2nd level is
## displayed)

kNN

# Drop the label column (1) so only the PCA scores are used as features
model.knn <- knn(train = train_set[, -1], test = test_set[, -1], cl = train_set$number, k = 50)
#CrossTable(x = test_set$number, y = model.knn, prop.chisq = FALSE)
table(`Actual Class` = test_set$number, `Predicted Class` = model.knn)
##             Predicted Class
## Actual Class   0   1   2   3   4   5   6   7   8   9
##            0 393   1   0   0   1   1   4   0   0   0
##            1   0 458   1   0   0   0   0   0   0   0
##            2   5  32 344   3   2   2   0  11   2   2
##            3   0  10   2 379   0   7   0   4   4   5
##            4   0  19   0   0 355   0   2   1   0  25
##            5   1   5   1   6   1 342   6   0   1   8
##            6   1   1   0   0   0   2 378   0   0   0
##            7   0  14   1   0   2   0   0 387   0  10
##            8   4  18   0  13   7  12   8   1 311  14
##            9   2   4   0   3   0   0   0  10   1 350
error.rate.knn <- sum(test_set$number != model.knn)/nrow(test_set)
accuracy_knn <- round((1 - error.rate.knn) *100,2)
accuracy_knn
## [1] 92.42
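
k was fixed at 50 above; a simple way to sanity-check that choice is to compare a few values of k on the same split (the candidate values below are only illustrative):

for (k in c(1, 5, 15, 50)) {
    pred <- knn(train = train_set[, -1], test = test_set[, -1], cl = train_set$number, k = k)
    cat("k =", k, " accuracy =", round(mean(pred == test_set$number) * 100, 2), "%\n")
}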

Conclusion

The RPart model performs moderately, with accuracy varying between roughly 62% and 70% across runs. kNN performs well, with accuracy between roughly 90% and 95%. Going further, random forests can improve predictive accuracy by generating a large number of bootstrapped trees (based on random samples of variables), classifying a case using each tree in this new "forest", and deciding the final predicted outcome by combining the results across all of the trees.
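
As a sketch of that extension (not run here), a random forest could be fitted on the same PCA features, assuming the randomForest package is installed; ntree = 100 is an arbitrary illustrative value:

library(randomForest)
# Fit a random forest on the same 260 PCA components (illustrative settings, not tuned)
model.rf <- randomForest(as.factor(number) ~ ., data = train_set, ntree = 100)
pred.rf <- predict(model.rf, newdata = test_set)
round(mean(pred.rf == test_set$number) * 100, 2)  # accuracy on the held-out test set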