#Section 1: Introduction
#Briefly describe the classification problem and the general data preprocessing.
library(klaR)
## Loading required package: MASS
library(miniUI)
library(rstudioapi)
library(e1071)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(kernlab)
library(lattice)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
##
## alpha
## The following object is masked from 'package:randomForest':
##
## margin
library(caret)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
#Read the sampled Kaggle digit data; column 1 is the digit label,
#columns 2:785 are the 28x28 pixel intensities (0-255).
DTS<-read.csv("Kaggle-digit-train-sample-small-1400.csv", stringsAsFactors = FALSE)
DTS<-as.data.frame(DTS)
DTS[, 1] <- as.factor(DTS[, 1])   #treat the label as a factor for classification
set.seed(88977)
#Stratified 70/30 train/test split on the label
indexes<-createDataPartition(DTS$label, times = 1, p = 0.7, list = FALSE)
Dtrain<-DTS[indexes,]
Dtest<-DTS[-indexes,]
#Rescale the pixel values from 0-255 to 0-1
Dtrain_nb<-Dtrain
Dtest_nb<-Dtest
Dtrain_nb[,2:785]<-Dtrain_nb[,2:785]/255.00
Dtest_nb[,2:785]<-Dtest_nb[,2:785]/255.00
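#Optional sanity check (not in the original output): createDataPartition
#stratifies on the label, so class proportions in the two splits should match.
round(rbind(train = prop.table(table(Dtrain$label)),
            test  = prop.table(table(Dtest$label))), 3)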
#Baseline NB model; the label column is excluded from the predictors
model_nb1 <- naiveBayes(Dtrain_nb[, -1], Dtrain_nb$label)
#Predict on the test dataset (label column excluded as well)
pred_nb1 <- predict(model_nb1, Dtest_nb[, -1])
#Evaluating the model
confusionMatrix(pred_nb1, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 38 0 0 0 0 2 0 0 0 1
## 1 0 49 0 5 0 4 0 2 8 3
## 2 0 0 24 0 0 0 2 0 0 0
## 3 0 0 1 31 0 3 0 0 3 0
## 4 0 0 0 0 12 0 1 0 2 0
## 5 0 0 1 0 0 8 1 0 0 0
## 6 1 0 9 0 1 0 36 0 0 0
## 7 0 0 0 0 0 1 0 11 0 1
## 8 3 0 3 2 2 16 3 3 24 0
## 9 0 1 1 4 24 4 0 25 4 38
##
## Overall Statistics
##
## Accuracy : 0.6483
## 95% CI : (0.6004, 0.6941)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6078
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.90476 0.9800 0.61538 0.73810 0.30769 0.21053
## Specificity 0.99202 0.9402 0.99472 0.98138 0.99208 0.99474
## Pos Pred Value 0.92683 0.6901 0.92308 0.81579 0.80000 0.80000
## Neg Pred Value 0.98939 0.9971 0.96173 0.97105 0.93300 0.92647
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09091 0.1172 0.05742 0.07416 0.02871 0.01914
## Detection Prevalence 0.09809 0.1699 0.06220 0.09091 0.03589 0.02392
## Balanced Accuracy 0.94839 0.9601 0.80505 0.85974 0.64989 0.60263
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.83721 0.26829 0.58537 0.88372
## Specificity 0.97067 0.99469 0.91512 0.83200
## Pos Pred Value 0.76596 0.84615 0.42857 0.37624
## Neg Pred Value 0.98113 0.92593 0.95304 0.98423
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.08612 0.02632 0.05742 0.09091
## Detection Prevalence 0.11244 0.03110 0.13397 0.24163
## Balanced Accuracy 0.90394 0.63149 0.75024 0.85786
#The baseline naive Bayes model reaches an accuracy of 0.6483 on the test set.
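#Optional: the overall accuracy can also be read programmatically from the
#confusionMatrix object instead of off the printed output.
cm_nb1 <- confusionMatrix(pred_nb1, Dtest_nb$label)
cm_nb1$overall["Accuracy"]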
#Section 2: Naive Bayes method
#Build a naive Bayes model with caret.
start_nb <- Sys.time()
model_nb2 <- train(label ~ ., data = Dtrain_nb, method = "nb",
                   trControl = trainControl(method = "none"),
                   tuneGrid = expand.grid(fL = 1, usekernel = TRUE, adjust = 1))
model_nb2
## Naive Bayes
##
## 982 samples
## 784 predictors
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## No pre-processing
## Resampling: None
Sys.time() - start_nb
## Time difference of 4.556447 secs
#Predict on the test dataset with the fitted model
predict_nb2 <- predict(model_nb2, newdata = Dtest_nb)
confusionMatrix(predict_nb2, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 28 0 0 0 0 0 0 0 0 1
## 1 0 24 0 0 0 0 0 0 0 0
## 2 2 1 29 1 1 1 3 2 1 1
## 3 1 0 1 24 0 3 1 0 4 1
## 4 0 1 0 1 35 3 2 5 2 20
## 5 9 0 0 10 1 25 2 0 5 0
## 6 2 1 3 0 2 1 34 0 0 0
## 7 0 18 5 4 0 4 1 34 2 20
## 8 0 5 1 2 0 1 0 0 27 0
## 9 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.622
## 95% CI : (0.5736, 0.6687)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5809
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.66667 0.48000 0.74359 0.57143 0.89744 0.65789
## Specificity 0.99734 1.00000 0.96570 0.97074 0.91029 0.92895
## Pos Pred Value 0.96552 1.00000 0.69048 0.68571 0.50725 0.48077
## Neg Pred Value 0.96401 0.93401 0.97340 0.95300 0.98854 0.96448
## Prevalence 0.10048 0.11962 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.06699 0.05742 0.06938 0.05742 0.08373 0.05981
## Detection Prevalence 0.06938 0.05742 0.10048 0.08373 0.16507 0.12440
## Balanced Accuracy 0.83200 0.74000 0.85464 0.77109 0.90386 0.79342
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.79070 0.82927 0.65854 0.0000
## Specificity 0.97600 0.85676 0.97613 1.0000
## Pos Pred Value 0.79070 0.38636 0.75000 NaN
## Neg Pred Value 0.97600 0.97879 0.96335 0.8971
## Prevalence 0.10287 0.09809 0.09809 0.1029
## Detection Rate 0.08134 0.08134 0.06459 0.0000
## Detection Prevalence 0.10287 0.21053 0.08612 0.0000
## Balanced Accuracy 0.88335 0.84302 0.81733 0.5000
#The kernel-density naive Bayes model reaches an accuracy of 0.622,
#slightly below the baseline model from Section 1.
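#Optional sketch (output not shown): caret's "nb" method exposes three tuning
#parameters (fL, usekernel, adjust), so a small cross-validated grid could
#replace the hand-picked values above. The grid here is an illustrative
#assumption, not a tuned choice.
grid_nb <- expand.grid(fL = 0:1, usekernel = c(TRUE, FALSE), adjust = 1)
model_nb3 <- train(label ~ ., data = Dtrain_nb, method = "nb",
                   trControl = trainControl(method = "cv", number = 3),
                   tuneGrid = grid_nb)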
#Section 3: Random Forest method
#Random forest with default settings; the label column is excluded from x
modelRan_rf<-randomForest(x = Dtrain_nb[, -1], y = Dtrain_nb$label)
#Predict for the test dataset
predictRan_rf1 <- predict(modelRan_rf, newdata = Dtest_nb[, -1])
confusionMatrix(predictRan_rf1, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 42 0 0 0 0 0 0 0 0 1
## 1 0 50 1 0 0 0 1 1 0 0
## 2 0 0 33 0 0 0 0 0 0 0
## 3 0 0 1 38 0 0 0 0 3 0
## 4 0 0 1 1 38 1 1 0 1 0
## 5 0 0 0 1 0 36 1 0 0 0
## 6 0 0 1 0 0 0 40 0 0 0
## 7 0 0 2 0 0 0 0 39 0 0
## 8 0 0 0 1 1 1 0 0 36 0
## 9 0 0 0 1 0 0 0 1 1 42
##
## Overall Statistics
##
## Accuracy : 0.9426
## 95% CI : (0.9158, 0.9629)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9361
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 1.0000 1.0000 0.84615 0.90476 0.97436 0.94737
## Specificity 0.9973 0.9918 1.00000 0.98936 0.98681 0.99474
## Pos Pred Value 0.9767 0.9434 1.00000 0.90476 0.88372 0.94737
## Neg Pred Value 1.0000 1.0000 0.98442 0.98936 0.99733 0.99474
## Prevalence 0.1005 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.1005 0.1196 0.07895 0.09091 0.09091 0.08612
## Detection Prevalence 0.1029 0.1268 0.07895 0.10048 0.10287 0.09091
## Balanced Accuracy 0.9987 0.9959 0.92308 0.94706 0.98058 0.97105
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.93023 0.95122 0.87805 0.9767
## Specificity 0.99733 0.99469 0.99204 0.9920
## Pos Pred Value 0.97561 0.95122 0.92308 0.9333
## Neg Pred Value 0.99204 0.99469 0.98681 0.9973
## Prevalence 0.10287 0.09809 0.09809 0.1029
## Detection Rate 0.09569 0.09330 0.08612 0.1005
## Detection Prevalence 0.09809 0.09809 0.09330 0.1077
## Balanced Accuracy 0.96378 0.97296 0.93505 0.9844
#Random Forest reaches an accuracy of 0.9426, the best of the models so far.
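#Optional sketch (output not shown): the fitted forest can be inspected
#without refitting, e.g. out-of-bag error by tree count and pixel importance.
plot(modelRan_rf)                     #OOB error rate vs. number of trees
varImpPlot(modelRan_rf, n.var = 20)   #the 20 most important pixel predictors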
#Section 4: Support Vector Machine (SVM)
set.seed(15416)
#Linear SVM tuned over the cost parameter C with 9 bootstrap resamples.
#Note that C must be positive, so the C = 0 grid point yields NaN below.
model_svm <- train(label ~ ., data = Dtrain_nb,
                   method = "svmLinear",
                   preProcess = c("center", "scale"),
                   trControl = trainControl(method = "boot", number = 9),
                   tuneGrid = expand.grid(C = seq(0, 1, 0.05)))
model_svm
## Support Vector Machines with Linear Kernel
##
## 982 samples
## 784 predictors
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## Pre-processing: centered (784), scaled (784)
## Resampling: Bootstrapped (9 reps)
## Summary of sample sizes: 982, 982, 982, 982, 982, 982, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.00 NaN NaN
## 0.05 0.8719117 0.8574256
## 0.10 0.8719117 0.8574256
## 0.15 0.8719117 0.8574256
## 0.20 0.8719117 0.8574256
## 0.25 0.8719117 0.8574256
## 0.30 0.8719117 0.8574256
## 0.35 0.8719117 0.8574256
## 0.40 0.8719117 0.8574256
## 0.45 0.8719117 0.8574256
## 0.50 0.8719117 0.8574256
## 0.55 0.8719117 0.8574256
## 0.60 0.8719117 0.8574256
## 0.65 0.8719117 0.8574256
## 0.70 0.8719117 0.8574256
## 0.75 0.8719117 0.8574256
## 0.80 0.8719117 0.8574256
## 0.85 0.8719117 0.8574256
## 0.90 0.8719117 0.8574256
## 0.95 0.8719117 0.8574256
## 1.00 0.8719117 0.8574256
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.05.
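#The resampled accuracy is flat for every C >= 0.05 (and NaN at the invalid
#C = 0), which suggests the cost penalty is not binding on this scaled sample.
#The tuning profile can be plotted directly from the train object (output not
#shown):
plot(model_svm)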
#Predict for the test dataset
predict_svm <- predict(model_svm, newdata = Dtest_nb)
confusionMatrix(predict_svm, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 41 0 0 0 0 1 1 1 0 1
## 1 0 48 2 2 0 2 0 1 1 0
## 2 0 1 29 0 0 0 2 0 0 0
## 3 0 0 2 35 0 1 0 0 4 0
## 4 0 0 2 0 36 2 1 0 1 5
## 5 0 0 0 3 0 28 2 0 2 1
## 6 1 0 3 0 0 0 37 0 0 0
## 7 0 1 1 1 0 1 0 35 0 0
## 8 0 0 0 0 1 2 0 0 33 0
## 9 0 0 0 1 2 1 0 4 0 36
##
## Overall Statistics
##
## Accuracy : 0.8565
## 95% CI : (0.8191, 0.8886)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8403
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97619 0.9600 0.74359 0.83333 0.92308 0.73684
## Specificity 0.98936 0.9783 0.99208 0.98138 0.97098 0.97895
## Pos Pred Value 0.91111 0.8571 0.90625 0.83333 0.76596 0.77778
## Neg Pred Value 0.99732 0.9945 0.97409 0.98138 0.99191 0.97382
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09809 0.1148 0.06938 0.08373 0.08612 0.06699
## Detection Prevalence 0.10766 0.1340 0.07656 0.10048 0.11244 0.08612
## Balanced Accuracy 0.98278 0.9691 0.86784 0.90736 0.94703 0.85789
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.86047 0.85366 0.80488 0.83721
## Specificity 0.98933 0.98939 0.99204 0.97867
## Pos Pred Value 0.90244 0.89744 0.91667 0.81818
## Neg Pred Value 0.98408 0.98417 0.97906 0.98128
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.08852 0.08373 0.07895 0.08612
## Detection Prevalence 0.09809 0.09330 0.08612 0.10526
## Balanced Accuracy 0.92490 0.92152 0.89846 0.90794
#The linear SVM reaches an accuracy of 0.8565 on the test set.
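#Optional sketch (not part of the original runs): a radial-basis kernel is a
#common next step when a linear SVM underperforms; caret exposes it as
#method = "svmRadial", tuned over sigma and C. The grid values here are
#illustrative assumptions, not tuned choices.
model_svm_rbf <- train(label ~ ., data = Dtrain_nb,
                       method = "svmRadial",
                       preProcess = c("center", "scale"),
                       trControl = trainControl(method = "boot", number = 9),
                       tuneGrid = expand.grid(sigma = c(0.001, 0.01),
                                              C = c(1, 10)))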
#Section 5: Conclusion
#Comparing the naive Bayes, Random Forest, and Support Vector Machine models,
#I found that Random Forest is the most accurate (0.9426), followed by the
#linear SVM (0.8565); the kernel-density naive Bayes model is the least
#accurate (0.622). Naive Bayes is also the fastest to train, while the SVM is
#the slowest in this project even after I reduced the number of bootstrap
#resamples to 9. NB training is fast because it only estimates class-conditional
#probabilities from simple counts and summary statistics; it requires no matrix
#computation or iterative optimization.
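#To illustrate the last point (a minimal sketch, not the klaR internals):
#for Gaussian naive Bayes, "training" amounts to one pass of per-class means
#and standard deviations over each predictor, e.g. for a single pixel column:
px <- Dtrain_nb[, 300]                 #an arbitrary pixel column, for illustration
tapply(px, Dtrain_nb$label, mean)      #per-class means
tapply(px, Dtrain_nb$label, sd)        #per-class standard deviations
#No iterative optimization is involved, which is why NB training is so fast.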