Part II Digit Recognition

rm(list=ls())
t1<- Sys.time()
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gbm)
## Loaded gbm 2.1.8
library(nnet)
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
cores<- parallel::detectCores()
registerDoParallel(cores=cores)
library(keras)
library(tensorflow)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
train <- read.csv('./Digit_Data/train.csv',header=T)
plotdigit<- function(datarow, rm1=F){
  # function will produce an image of the data given
  # the function takes away the first value because it is the target
  title<- datarow[1] # get actual value from training data
  if(rm1){
    datarow<- datarow[-1] # remove the actual value
  }
  datarow<- as.numeric(datarow) # convert to numeric
  x<- rep(0:27)/27 
  y<- rep(0:27)/27
  z<- matrix(datarow, ncol=28, byrow=T)
  rotate <- function(x) t(apply(x, 2, rev))
  z<- rotate(z)
  
  image(x,y,z, col=gray.colors(255, start=1, end=0), asp=1,
        xlim=c(0,1), ylim=c(-0.1,1.1), useRaster = T, axes=F, xlab='', ylab='')
}
plotdigit(train[10,],rm1=T)

### Using the training.csv file, plot representations of the first 10 images to understand the data format. Go ahead and divide all pixels by 255 to produce values between 0 and 1. (This is equivalent to min-max scaling.)

par(mfrow=c(2,5))
for(i in 1:10){
  plotdigit(train[i,],rm1=T)
}

### Scaled Data

scaled_var <- train[,-1]/255
par(mfrow=c(2,5))

for(i in 1:10){
  plotdigit(scaled_var[i,],rm1=T)
}

### Frequency

ggplot(train,aes(x=as.factor(label),fill=label))+
  geom_bar(stat="count",color="white")+
  geom_text(stat='count', aes(label=..count..), vjust=+3)+
  scale_fill_gradient(low="blue",high="green")+
  labs(title="Digits in Train Data",x="Digits")

mypca <-prcomp(scaled_var)

95% of variance, 154 components

It took 154 components to break the 95% variance mark.

for(i in 1:ncol(train)){
  pov <-sum((mypca$sdev[1:i]^2))/sum(mypca$sdev^2)
  if(pov >= .95){
    print(paste0(i,' ',pov))
    break
  }
}
## [1] "154 0.950433238263376"

100 of variance, 704 components

It took 704 components to break the 100% variance mark. I would assume that the remaining components don’t have anything different in them. That is why there are only 704 components that account for all the variance.

for(i in 1:ncol(train)){
  pov <-sum((mypca$sdev[1:i]^2))/sum(mypca$sdev^2)
  if(pov == 1){
    print(paste0(i,' ',pov))
    break
  }
}
## [1] "704 1"

Plot the first 10 images generated by PCA. They will appear to be noise. Why? (5 points)

I believe it is all noise because this is made up a mix of all the numbers.

f_ten <- mypca$x%*%t(mypca$rotation)+mypca$center
image(f_ten[,1:10])

Now, select only those images that have labels that are 8’s. Re-run PCA that accounts for all of the variance (100%). Plot the first 10 images. What do you see?

I see a darker image similar to what I saw with the first ten of all the numbers.

eights <- train %>% filter(label == 8)
eights <- eights[,2:ncol(eights)]
eights_scaled <- eights/255
eights_pca <- prcomp(eights_scaled)
for(i in 1:ncol(eights)){
  pov <-sum((eights_pca$sdev[1:i]^2))/sum(eights_pca$sdev^2)
  if(pov == 1){
    print(paste0(i,' ',pov))
    break
  }
}
## [1] "537 1"
image(eights_pca$x%*%t(eights_pca$rotation)+eights_pca$center)

An incorrect approach to predicting the images would be to build a linear regression model with y as the digit values and X as the pixel matrix. Instead, we can build a multinomial model that classifies the digits. Build a multinomial model on the entirety of the training set. Then provide its classification accuracy (percent correctly identified) as well as a matrix of observed versus forecast values (confusion matrix). This matrix will be a 10 x 10, and correct classifications will be on the diagonal. (10 points)

nrows_train<- nrow(train)*0.75 
set.seed(5465485)
rows_train<- sample(1:nrow(train),nrows_train)

data_train<- train[rows_train,]
data_test<- train[-rows_train,]

y<- as.numeric(data_train[,1])
X<- data_train[, -1]

var_cols<- apply(data_train, 2, 'var')
no_var_cols<- which(var_cols==0)

X<- data_train[ , -c(no_var_cols, 1)]

pca_svd<- prcomp(X, center=F, scale=F)
X_cov<- cov(X)
pca_cov<- prcomp(X_cov)
pca_cov_percent_var<- pca_cov$sdev^2/sum(pca_cov$sdev^2)
X_loading<- pca_cov$rotation #Loadings from the PCA


X_train<- as.matrix(data_train[, -c(no_var_cols,1)]/max(data_train[,-1]))
X_train<- X_train %*% X_loading[,1:50]
y_train<- data_train[,'label']

# do the same to the holdout set
X_test<- as.matrix(data_test[ ,-c(no_var_cols,1)]/max(data_train[,-1]))
X_test<- X_test %*% X_loading[,1:50]
y_test<- data_test[,'label']


df.train<- data.frame(y=as.factor(y_train), X=X_train/255)
df.test<- data.frame(y=as.factor(y_test), X=X_test/255)
model_nnet1<- nnet(y~., data=df.train,size=250, MaxNWts=100000, maxit=1000)
## # weights:  15260
## initial  value 109470.589920 
## iter  10 value 76820.843000
## iter  20 value 72249.492476
## iter  30 value 71614.058817
## iter  40 value 69602.847207
## iter  50 value 65121.562804
## iter  60 value 59113.468307
## iter  70 value 49148.323303
## iter  80 value 42999.115939
## iter  90 value 34416.032805
## iter 100 value 27023.360196
## iter 110 value 21543.070942
## iter 120 value 18309.366391
## iter 130 value 15708.956633
## iter 140 value 13906.560633
## iter 150 value 12525.465631
## iter 160 value 11541.303476
## iter 170 value 10808.252490
## iter 180 value 10454.462293
## iter 190 value 10169.287713
## iter 200 value 9900.465475
## iter 210 value 9715.604684
## iter 220 value 9607.370369
## iter 230 value 9468.343161
## iter 240 value 9311.773016
## iter 250 value 9177.303073
## iter 260 value 8893.694631
## iter 270 value 8676.387485
## iter 280 value 8413.118456
## iter 290 value 8044.167337
## iter 300 value 7532.316634
## iter 310 value 7041.451903
## iter 320 value 6525.558760
## iter 330 value 6067.007650
## iter 340 value 5677.762728
## iter 350 value 5279.908655
## iter 360 value 4982.873051
## iter 370 value 4668.023911
## iter 380 value 4306.585320
## iter 390 value 3947.470119
## iter 400 value 3609.799024
## iter 410 value 3344.953722
## iter 420 value 3033.157385
## iter 430 value 2791.404654
## iter 440 value 2562.281810
## iter 450 value 2342.816180
## iter 460 value 2124.672651
## iter 470 value 1868.910855
## iter 480 value 1632.766063
## iter 490 value 1390.536487
## iter 500 value 1182.232985
## iter 510 value 975.643204
## iter 520 value 792.311094
## iter 530 value 607.908515
## iter 540 value 435.277347
## iter 550 value 286.646576
## iter 560 value 171.813830
## iter 570 value 95.598155
## iter 580 value 30.961702
## iter 590 value 11.664114
## iter 600 value 5.430211
## iter 610 value 2.866294
## iter 620 value 1.713742
## iter 630 value 1.119423
## iter 640 value 0.768320
## iter 650 value 0.534799
## iter 660 value 0.391785
## iter 670 value 0.289408
## iter 680 value 0.219501
## iter 690 value 0.177276
## iter 700 value 0.144392
## iter 710 value 0.121584
## iter 720 value 0.102887
## iter 730 value 0.088378
## iter 740 value 0.075142
## iter 750 value 0.065713
## iter 760 value 0.055225
## iter 770 value 0.048735
## iter 780 value 0.043931
## iter 790 value 0.037243
## iter 800 value 0.031775
## iter 810 value 0.028938
## iter 820 value 0.025982
## iter 830 value 0.021580
## iter 840 value 0.018571
## iter 850 value 0.017635
## iter 860 value 0.014818
## iter 870 value 0.012852
## iter 880 value 0.012006
## iter 890 value 0.010448
## iter 900 value 0.009930
## iter 910 value 0.008539
## iter 920 value 0.008276
## iter 930 value 0.006799
## iter 940 value 0.006633
## iter 950 value 0.005726
## iter 960 value 0.005626
## iter 970 value 0.004772
## iter 980 value 0.004698
## iter 990 value 0.003967
## iter1000 value 0.003866
## final  value 0.003866 
## stopped after 1000 iterations
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:tensorflow':
## 
##     train
## The following object is masked from 'package:purrr':
## 
##     lift
y_pred_test<- predict(model_nnet1,df.test,type ='class')

confusionMatrix(as.factor(y_pred_test),df.test$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0  976    0    5    0    2    2    8    2    0    4
##          1    0 1157    6    1    3    0    1    1    4    3
##          2    2    1 1015   10    1    1    6   15   10    2
##          3    1    0    9 1046    0    9    0    5   13    8
##          4    2    1    2    3 1018    1    8    3    4   11
##          5    0    1    1    9    0  905    2    1    8    5
##          6    2    1    4    0    4    6  975    0    2    0
##          7    2    5    7    5    6    3    1 1088    5   10
##          8    2    1    6    9    2    5    3    3  952    6
##          9    8    1    3    8   17   11    0   18    9  996
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9646         
##                  95% CI : (0.9609, 0.968)
##     No Information Rate : 0.1112         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9606         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.98090   0.9906  0.95936  0.95875  0.96676  0.95970
## Specificity           0.99758   0.9980  0.99492  0.99522  0.99630  0.99717
## Pos Pred Value        0.97698   0.9838  0.95484  0.95875  0.96676  0.97103
## Neg Pred Value        0.99800   0.9988  0.99544  0.99522  0.99630  0.99603
## Prevalence            0.09476   0.1112  0.10076  0.10390  0.10029  0.08981
## Detection Rate        0.09295   0.1102  0.09667  0.09962  0.09695  0.08619
## Detection Prevalence  0.09514   0.1120  0.10124  0.10390  0.10029  0.08876
## Balanced Accuracy     0.98924   0.9943  0.97714  0.97699  0.98153  0.97844
##                      Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity           0.97112   0.9577  0.94538  0.95311
## Specificity           0.99800   0.9953  0.99610  0.99207
## Pos Pred Value        0.98089   0.9611  0.96259  0.92997
## Neg Pred Value        0.99695   0.9949  0.99422  0.99480
## Prevalence            0.09562   0.1082  0.09590  0.09952
## Detection Rate        0.09286   0.1036  0.09067  0.09486
## Detection Prevalence  0.09467   0.1078  0.09419  0.10200
## Balanced Accuracy     0.98456   0.9765  0.97074  0.97259