#Section 1: Introduction
#Briefly describe the classification problem and the general data preprocessing.
library(klaR)
## Loading required package: MASS
library(miniUI)
library(rstudioapi)
library(e1071)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(kernlab)
library(lattice)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
##
## alpha
## The following object is masked from 'package:randomForest':
##
## margin
library(caret)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
#Read the sampled Kaggle digit data; column 1 is the digit label,
#columns 2:785 are the 28x28 pixel intensities (0-255).
DTS<-read.csv("Kaggle-digit-train-sample-small-1400.csv", stringsAsFactors = FALSE)
DTS<-as.data.frame(DTS)
DTS[, 1] <- as.factor(DTS[, 1])   #treat the label as a factor for classification
set.seed(88977)
#Stratified 70/30 train/test split on the label
indexes<-createDataPartition(DTS$label, times = 1, p = 0.7, list = FALSE)
Dtrain<-DTS[indexes,]
Dtest<-DTS[-indexes,]
#Rescale the pixel values from 0-255 to 0-1
Dtrain_nb<-Dtrain
Dtest_nb<-Dtest
Dtrain_nb[,2:785]<-Dtrain_nb[,2:785]/255.00
Dtest_nb[,2:785]<-Dtest_nb[,2:785]/255.00
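#Optional sanity check (not in the original output): createDataPartition
#stratifies on the label, so class proportions in the two splits should match.
round(rbind(train = prop.table(table(Dtrain$label)),
            test  = prop.table(table(Dtest$label))), 3)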
#Baseline NB model; the label column is excluded from the predictors
model_nb1 <- naiveBayes(Dtrain_nb[, -1], Dtrain_nb$label)
#Predict on the test dataset (label column excluded as well)
pred_nb1 <- predict(model_nb1, Dtest_nb[, -1])
#Evaluating the model
confusionMatrix(pred_nb1, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 38 0 0 0 0 2 0 0 0 1
## 1 0 49 0 5 0 4 0 2 8 3
## 2 0 0 24 0 0 0 2 0 0 0
## 3 0 0 1 31 0 3 0 0 3 0
## 4 0 0 0 0 12 0 1 0 2 0
## 5 0 0 1 0 0 8 1 0 0 0
## 6 1 0 9 0 1 0 36 0 0 0
## 7 0 0 0 0 0 1 0 11 0 1
## 8 3 0 3 2 2 16 3 3 24 0
## 9 0 1 1 4 24 4 0 25 4 38
##
## Overall Statistics
##
## Accuracy : 0.6483
## 95% CI : (0.6004, 0.6941)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6078
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.90476 0.9800 0.61538 0.73810 0.30769 0.21053
## Specificity 0.99202 0.9402 0.99472 0.98138 0.99208 0.99474
## Pos Pred Value 0.92683 0.6901 0.92308 0.81579 0.80000 0.80000
## Neg Pred Value 0.98939 0.9971 0.96173 0.97105 0.93300 0.92647
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09091 0.1172 0.05742 0.07416 0.02871 0.01914
## Detection Prevalence 0.09809 0.1699 0.06220 0.09091 0.03589 0.02392
## Balanced Accuracy 0.94839 0.9601 0.80505 0.85974 0.64989 0.60263
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.83721 0.26829 0.58537 0.88372
## Specificity 0.97067 0.99469 0.91512 0.83200
## Pos Pred Value 0.76596 0.84615 0.42857 0.37624
## Neg Pred Value 0.98113 0.92593 0.95304 0.98423
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.08612 0.02632 0.05742 0.09091
## Detection Prevalence 0.11244 0.03110 0.13397 0.24163
## Balanced Accuracy 0.90394 0.63149 0.75024 0.85786
#The baseline naive Bayes model reaches an accuracy of 0.6483 on the test set.
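#Optional: the overall accuracy can also be read programmatically from the
#confusionMatrix object instead of off the printed output.
cm_nb1 <- confusionMatrix(pred_nb1, Dtest_nb$label)
cm_nb1$overall["Accuracy"]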
#Section 2: Naive Bayes method
#Build a naive Bayes model with caret.
start_nb <- Sys.time()
model_nb2 <- train(label ~ ., data = Dtrain_nb, method = "nb",
                   trControl = trainControl(method = "none"),
                   tuneGrid = expand.grid(fL = 1, usekernel = TRUE, adjust = 1))
model_nb2
## Naive Bayes
##
## 982 samples
## 784 predictors
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## No pre-processing
## Resampling: None
Sys.time() - start_nb
## Time difference of 4.556447 secs
#Predict on the test dataset with the fitted model
predict_nb2 <- predict(model_nb2, newdata = Dtest_nb)
confusionMatrix(predict_nb2, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 28 0 0 0 0 0 0 0 0 1
## 1 0 24 0 0 0 0 0 0 0 0
## 2 2 1 29 1 1 1 3 2 1 1
## 3 1 0 1 24 0 3 1 0 4 1
## 4 0 1 0 1 35 3 2 5 2 20
## 5 9 0 0 10 1 25 2 0 5 0
## 6 2 1 3 0 2 1 34 0 0 0
## 7 0 18 5 4 0 4 1 34 2 20
## 8 0 5 1 2 0 1 0 0 27 0
## 9 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.622
## 95% CI : (0.5736, 0.6687)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5809
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.66667 0.48000 0.74359 0.57143 0.89744 0.65789
## Specificity 0.99734 1.00000 0.96570 0.97074 0.91029 0.92895
## Pos Pred Value 0.96552 1.00000 0.69048 0.68571 0.50725 0.48077
## Neg Pred Value 0.96401 0.93401 0.97340 0.95300 0.98854 0.96448
## Prevalence 0.10048 0.11962 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.06699 0.05742 0.06938 0.05742 0.08373 0.05981
## Detection Prevalence 0.06938 0.05742 0.10048 0.08373 0.16507 0.12440
## Balanced Accuracy 0.83200 0.74000 0.85464 0.77109 0.90386 0.79342
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.79070 0.82927 0.65854 0.0000
## Specificity 0.97600 0.85676 0.97613 1.0000
## Pos Pred Value 0.79070 0.38636 0.75000 NaN
## Neg Pred Value 0.97600 0.97879 0.96335 0.8971
## Prevalence 0.10287 0.09809 0.09809 0.1029
## Detection Rate 0.08134 0.08134 0.06459 0.0000
## Detection Prevalence 0.10287 0.21053 0.08612 0.0000
## Balanced Accuracy 0.88335 0.84302 0.81733 0.5000
#The kernel-density naive Bayes model reaches an accuracy of 0.622,
#slightly below the baseline model from Section 1.
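#Optional sketch (output not shown): caret's "nb" method exposes three tuning
#parameters (fL, usekernel, adjust), so a small cross-validated grid could
#replace the hand-picked values above. The grid here is an illustrative
#assumption, not a tuned choice.
grid_nb <- expand.grid(fL = 0:1, usekernel = c(TRUE, FALSE), adjust = 1)
model_nb3 <- train(label ~ ., data = Dtrain_nb, method = "nb",
                   trControl = trainControl(method = "cv", number = 3),
                   tuneGrid = grid_nb)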
#Section 3: Random Forest method
#Random forest with default settings; the label column is excluded from x
modelRan_rf<-randomForest(x = Dtrain_nb[, -1], y = Dtrain_nb$label)
#Predict for the test dataset
predictRan_rf1 <- predict(modelRan_rf, newdata = Dtest_nb[, -1])
confusionMatrix(predictRan_rf1, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 42 0 0 0 0 0 0 0 0 1
## 1 0 50 1 0 0 0 1 1 0 0
## 2 0 0 33 0 0 0 0 0 0 0
## 3 0 0 1 38 0 0 0 0 3 0
## 4 0 0 1 1 38 1 1 0 1 0
## 5 0 0 0 1 0 36 1 0 0 0
## 6 0 0 1 0 0 0 40 0 0 0
## 7 0 0 2 0 0 0 0 39 0 0
## 8 0 0 0 1 1 1 0 0 36 0
## 9 0 0 0 1 0 0 0 1 1 42
##
## Overall Statistics
##
## Accuracy : 0.9426
## 95% CI : (0.9158, 0.9629)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9361
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 1.0000 1.0000 0.84615 0.90476 0.97436 0.94737
## Specificity 0.9973 0.9918 1.00000 0.98936 0.98681 0.99474
## Pos Pred Value 0.9767 0.9434 1.00000 0.90476 0.88372 0.94737
## Neg Pred Value 1.0000 1.0000 0.98442 0.98936 0.99733 0.99474
## Prevalence 0.1005 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.1005 0.1196 0.07895 0.09091 0.09091 0.08612
## Detection Prevalence 0.1029 0.1268 0.07895 0.10048 0.10287 0.09091
## Balanced Accuracy 0.9987 0.9959 0.92308 0.94706 0.98058 0.97105
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.93023 0.95122 0.87805 0.9767
## Specificity 0.99733 0.99469 0.99204 0.9920
## Pos Pred Value 0.97561 0.95122 0.92308 0.9333
## Neg Pred Value 0.99204 0.99469 0.98681 0.9973
## Prevalence 0.10287 0.09809 0.09809 0.1029
## Detection Rate 0.09569 0.09330 0.08612 0.1005
## Detection Prevalence 0.09809 0.09809 0.09330 0.1077
## Balanced Accuracy 0.96378 0.97296 0.93505 0.9844
#Random Forest reaches an accuracy of 0.9426, the best of the models so far.
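#Optional sketch (output not shown): the fitted forest can be inspected
#without refitting, e.g. out-of-bag error by tree count and pixel importance.
plot(modelRan_rf)                     #OOB error rate vs. number of trees
varImpPlot(modelRan_rf, n.var = 20)   #the 20 most important pixel predictors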
#Section 4: Support Vector Machine (SVM)
set.seed(15416)
#Linear SVM tuned over the cost parameter C with 9 bootstrap resamples.
#Note that C must be positive, so the C = 0 grid point yields NaN below.
model_svm <- train(label ~ ., data = Dtrain_nb,
                   method = "svmLinear",
                   preProcess = c("center", "scale"),
                   trControl = trainControl(method = "boot", number = 9),
                   tuneGrid = expand.grid(C = seq(0, 1, 0.05)))
model_svm
## Support Vector Machines with Linear Kernel
##
## 982 samples
## 784 predictors
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## Pre-processing: centered (784), scaled (784)
## Resampling: Bootstrapped (9 reps)
## Summary of sample sizes: 982, 982, 982, 982, 982, 982, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.00 NaN NaN
## 0.05 0.8719117 0.8574256
## 0.10 0.8719117 0.8574256
## 0.15 0.8719117 0.8574256
## 0.20 0.8719117 0.8574256
## 0.25 0.8719117 0.8574256
## 0.30 0.8719117 0.8574256
## 0.35 0.8719117 0.8574256
## 0.40 0.8719117 0.8574256
## 0.45 0.8719117 0.8574256
## 0.50 0.8719117 0.8574256
## 0.55 0.8719117 0.8574256
## 0.60 0.8719117 0.8574256
## 0.65 0.8719117 0.8574256
## 0.70 0.8719117 0.8574256
## 0.75 0.8719117 0.8574256
## 0.80 0.8719117 0.8574256
## 0.85 0.8719117 0.8574256
## 0.90 0.8719117 0.8574256
## 0.95 0.8719117 0.8574256
## 1.00 0.8719117 0.8574256
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.05.
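#The resampled accuracy is flat for every C >= 0.05 (and NaN at the invalid
#C = 0), which suggests the cost penalty is not binding on this scaled sample.
#The tuning profile can be plotted directly from the train object (output not
#shown):
plot(model_svm)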
#Predict for the test dataset
predict_svm <- predict(model_svm, newdata = Dtest_nb)
confusionMatrix(predict_svm, Dtest_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 41 0 0 0 0 1 1 1 0 1
## 1 0 48 2 2 0 2 0 1 1 0
## 2 0 1 29 0 0 0 2 0 0 0
## 3 0 0 2 35 0 1 0 0 4 0
## 4 0 0 2 0 36 2 1 0 1 5
## 5 0 0 0 3 0 28 2 0 2 1
## 6 1 0 3 0 0 0 37 0 0 0
## 7 0 1 1 1 0 1 0 35 0 0
## 8 0 0 0 0 1 2 0 0 33 0
## 9 0 0 0 1 2 1 0 4 0 36
##
## Overall Statistics
##
## Accuracy : 0.8565
## 95% CI : (0.8191, 0.8886)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8403
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97619 0.9600 0.74359 0.83333 0.92308 0.73684
## Specificity 0.98936 0.9783 0.99208 0.98138 0.97098 0.97895
## Pos Pred Value 0.91111 0.8571 0.90625 0.83333 0.76596 0.77778
## Neg Pred Value 0.99732 0.9945 0.97409 0.98138 0.99191 0.97382
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09809 0.1148 0.06938 0.08373 0.08612 0.06699
## Detection Prevalence 0.10766 0.1340 0.07656 0.10048 0.11244 0.08612
## Balanced Accuracy 0.98278 0.9691 0.86784 0.90736 0.94703 0.85789
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.86047 0.85366 0.80488 0.83721
## Specificity 0.98933 0.98939 0.99204 0.97867
## Pos Pred Value 0.90244 0.89744 0.91667 0.81818
## Neg Pred Value 0.98408 0.98417 0.97906 0.98128
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.08852 0.08373 0.07895 0.08612
## Detection Prevalence 0.09809 0.09330 0.08612 0.10526
## Balanced Accuracy 0.92490 0.92152 0.89846 0.90794
#The linear SVM reaches an accuracy of 0.8565 on the test set.
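#Optional sketch (not part of the original runs): a radial-basis kernel is a
#common next step when a linear SVM underperforms; caret exposes it as
#method = "svmRadial", tuned over sigma and C. The grid values here are
#illustrative assumptions, not tuned choices.
model_svm_rbf <- train(label ~ ., data = Dtrain_nb,
                       method = "svmRadial",
                       preProcess = c("center", "scale"),
                       trControl = trainControl(method = "boot", number = 9),
                       tuneGrid = expand.grid(sigma = c(0.001, 0.01),
                                              C = c(1, 10)))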
#Section 5: Conclusion
#Comparing the naive Bayes, Random Forest, and Support Vector Machine models,
#I found that Random Forest is the most accurate (0.9426), followed by the
#linear SVM (0.8565); the kernel-density naive Bayes model is the least
#accurate (0.622). Naive Bayes is also the fastest to train, while the SVM is
#the slowest in this project even after I reduced the number of bootstrap
#resamples to 9. NB training is fast because it only estimates class-conditional
#probabilities from simple counts and summary statistics; it requires no matrix
#computation or iterative optimization.
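#To illustrate the last point (a minimal sketch, not the klaR internals):
#for Gaussian naive Bayes, "training" amounts to one pass of per-class means
#and standard deviations over each predictor, e.g. for a single pixel column:
px <- Dtrain_nb[, 300]                 #an arbitrary pixel column, for illustration
tapply(px, Dtrain_nb$label, mean)      #per-class means
tapply(px, Dtrain_nb$label, sd)        #per-class standard deviations
#No iterative optimization is involved, which is why NB training is so fast.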