#install.packages("klaR")
#install.packages("miniUI")
#install.packages("rstudioapi")
#install.packages("kernlab")
#install.packages("boot")
library(klaR)
library(miniUI)
library(rstudioapi)
library(e1071)
library(randomForest)
library(kernlab)
library(caret)
library(boot)
# Load the digit sample CSV; character columns are kept as plain strings.
digit <- as.data.frame(
  read.csv(
    file = "Kaggle-digit-train-sample-small-1400.csv",
    stringsAsFactors = FALSE
  )
)
# The first column is converted to a factor so that the models below
# treat the task as classification rather than regression.
digit[[1]] <- as.factor(digit[[1]])

# Stratified 70/30 split on the label, seeded so the split is repeatable.
set.seed(88977)
indexes <- createDataPartition(
  y = digit$label, times = 1, p = 0.7, list = FALSE
)
train <- digit[indexes, ]
test <- digit[-indexes, ]

# Scaled copies: columns 2..785 are divided by 255 (a 0-1 scale when the
# raw values are 0-255 intensities); column 1 is left untouched.
train_nb <- train
train_nb[, 2:785] <- train[, 2:785] / 255
test_nb <- test
test_nb[, 2:785] <- test[, 2:785] / 255
# Naive Bayes (e1071) fitted on the predictor columns only.
# Column 1 of train_nb / test_nb is the label. It must not be part of `x`,
# otherwise the target itself is used as a feature (label leakage) and the
# reported test accuracy no longer measures what the pixels can predict.
model_nb1 <- naiveBayes(x = train_nb[, -1], y = train_nb$label)
# Predict the held-out rows from their predictor columns alone.
pred_nb1 <- predict(model_nb1, newdata = test_nb[, -1])
# Compare predictions with the true labels.
# NOTE: the confusion-matrix printout recorded below was produced before the
# label column was excluded from `x`; re-run to refresh it.
confusionMatrix(pred_nb1, test_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 38 0 0 0 0 2 0 0 0 1
## 1 0 49 0 5 0 4 0 2 8 3
## 2 0 0 24 0 0 0 2 0 0 0
## 3 0 0 1 31 0 3 0 0 3 0
## 4 0 0 0 0 12 0 1 0 2 0
## 5 0 0 1 0 0 8 1 0 0 0
## 6 1 0 9 0 1 0 36 0 0 0
## 7 0 0 0 0 0 1 0 11 0 1
## 8 3 0 3 2 2 16 3 3 24 0
## 9 0 1 1 4 24 4 0 25 4 38
##
## Overall Statistics
##
## Accuracy : 0.6483
## 95% CI : (0.6004, 0.6941)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6078
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.90476 0.9800 0.61538 0.73810 0.30769 0.21053
## Specificity 0.99202 0.9402 0.99472 0.98138 0.99208 0.99474
## Pos Pred Value 0.92683 0.6901 0.92308 0.81579 0.80000 0.80000
## Neg Pred Value 0.98939 0.9971 0.96173 0.97105 0.93300 0.92647
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09091 0.1172 0.05742 0.07416 0.02871 0.01914
## Detection Prevalence 0.09809 0.1699 0.06220 0.09091 0.03589 0.02392
## Balanced Accuracy 0.94839 0.9601 0.80505 0.85974 0.64989 0.60263
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.83721 0.26829 0.58537 0.88372
## Specificity 0.97067 0.99469 0.91512 0.83200
## Pos Pred Value 0.76596 0.84615 0.42857 0.37624
## Neg Pred Value 0.98113 0.92593 0.95304 0.98423
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.08612 0.02632 0.05742 0.09091
## Detection Prevalence 0.11244 0.03110 0.13397 0.24163
## Balanced Accuracy 0.90394 0.63149 0.75024 0.85786
#The accuracy rate in the recorded output above is 0.6483
# Naive Bayes through caret (method "nb"), fitted once with fixed parameters:
# trainControl(method = "none") turns resampling off, so the single-row
# tuneGrid is used as given and nothing is tuned.
# start_nb marks the start of the fit so the elapsed time can be printed.
start_nb <- Sys.time()
model_nb2 <- train(
  label ~ .,
  data = train_nb,
  method = "nb",
  trControl = trainControl(method = "none"),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  tuneGrid = expand.grid(fL = 1, usekernel = TRUE, adjust = 1)
)
model_nb2
## Naive Bayes
##
## 982 samples
## 784 predictors
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## No pre-processing
## Resampling: None
# Elapsed wall-clock time since start_nb (fit plus the model print above).
Sys.time() - start_nb
## Time difference of 6.697135 secs
# (another recorded run: Time difference of 4.718463 secs)
# Predict the held-out test set with the fixed-parameter model and evaluate.
predict_nb2 <- predict(model_nb2, newdata = test_nb)
confusionMatrix(predict_nb2, test_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 30 0 0 0 0 0 0 0 0 1
## 1 0 24 0 0 0 0 0 0 0 0
## 2 1 1 28 1 1 1 3 2 1 1
## 3 1 0 3 26 0 3 1 0 5 1
## 4 0 1 0 1 35 3 2 5 2 21
## 5 8 0 0 9 0 25 2 0 5 0
## 6 2 1 3 0 2 1 34 0 0 0
## 7 0 18 4 3 1 4 1 34 1 19
## 8 0 5 1 2 0 1 0 0 27 0
## 9 0 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6292
## 95% CI : (0.5809, 0.6756)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5888
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.71429 0.48000 0.71795 0.61905 0.89744 0.65789
## Specificity 0.99734 1.00000 0.96834 0.96277 0.90765 0.93684
## Pos Pred Value 0.96774 1.00000 0.70000 0.65000 0.50000 0.51020
## Neg Pred Value 0.96899 0.93401 0.97090 0.95767 0.98851 0.96477
## Prevalence 0.10048 0.11962 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.07177 0.05742 0.06699 0.06220 0.08373 0.05981
## Detection Prevalence 0.07416 0.05742 0.09569 0.09569 0.16746 0.11722
## Balanced Accuracy 0.85581 0.74000 0.84314 0.79091 0.90254 0.79737
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.79070 0.82927 0.65854 0.0000
## Specificity 0.97600 0.86472 0.97613 1.0000
## Pos Pred Value 0.79070 0.40000 0.75000 NaN
## Neg Pred Value 0.97600 0.97898 0.96335 0.8971
## Prevalence 0.10287 0.09809 0.09809 0.1029
## Detection Rate 0.08134 0.08134 0.06459 0.0000
## Detection Prevalence 0.10287 0.20335 0.08612 0.0000
## Balanced Accuracy 0.88335 0.84699 0.81733 0.5000
#The accuracy rate in the recorded output above is 0.6292
# Random forest fitted on the predictor columns only.
# Column 1 of train_nb / test_nb is the label; passing the whole data frame
# as `x` would let the forest split on the target itself (label leakage),
# which inflates both the OOB estimate and the test accuracy.
# The seed makes the bootstrap samples and feature draws repeatable.
set.seed(4321)
model_rf <- randomForest(x = train_nb[, -1], y = train_nb$label)
# OOB estimate of error rate: 6.21%
# (recorded before the label column was excluded from `x`; re-run to refresh)
# Predict the held-out rows from their predictor columns alone and evaluate.
predict_rf1 <- predict(model_rf, newdata = test_nb[, -1])
confusionMatrix(predict_rf1, test_nb$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 42 0 0 0 0 0 0 0 0 1
## 1 0 50 1 0 0 0 1 1 0 0
## 2 0 0 33 0 0 0 0 0 0 0
## 3 0 0 1 38 0 0 0 0 3 0
## 4 0 0 1 1 38 1 1 0 1 0
## 5 0 0 0 1 0 36 1 0 0 0
## 6 0 0 1 0 0 0 40 0 0 0
## 7 0 0 2 0 0 0 0 39 0 0
## 8 0 0 0 1 1 1 0 0 36 0
## 9 0 0 0 1 0 0 0 1 1 42
##
## Overall Statistics
##
## Accuracy : 0.9426
## 95% CI : (0.9158, 0.9629)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9361
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 1.0000 1.0000 0.84615 0.90476 0.97436 0.94737
## Specificity 0.9973 0.9918 1.00000 0.98936 0.98681 0.99474
## Pos Pred Value 0.9767 0.9434 1.00000 0.90476 0.88372 0.94737
## Neg Pred Value 1.0000 1.0000 0.98442 0.98936 0.99733 0.99474
## Prevalence 0.1005 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.1005 0.1196 0.07895 0.09091 0.09091 0.08612
## Detection Prevalence 0.1029 0.1268 0.07895 0.10048 0.10287 0.09091
## Balanced Accuracy 0.9987 0.9959 0.92308 0.94706 0.98058 0.97105
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.93023 0.95122 0.87805 0.9767
## Specificity 0.99733 0.99469 0.99204 0.9920
## Pos Pred Value 0.97561 0.95122 0.92308 0.9333
## Neg Pred Value 0.99204 0.99469 0.98681 0.9973
## Prevalence 0.10287 0.09809 0.09809 0.1029
## Detection Rate 0.09569 0.09330 0.08612 0.1005
## Detection Prevalence 0.09809 0.09809 0.09330 0.1077
## Balanced Accuracy 0.96378 0.97296 0.93505 0.9844
#Accuracy rate in the recorded output above is 0.9426
# Linear SVM tuned through caret: predictors are centred and scaled, and the
# cost parameter C is chosen by accuracy over 25 bootstrap resamples.
# The seed makes the resamples repeatable.
set.seed(15416)
model_svm_linear <- train(
  label ~ .,
  data = train_nb,
  method = "svmLinear",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "boot", number = 25),
  # C must be strictly positive. The grid used to start at 0, which cannot
  # be fitted and produced a NaN accuracy row in the resampling table.
  tuneGrid = expand.grid(C = seq(0.05, 1, 0.05))
)
# Print the resampling summary and the selected C.
model_svm_linear
## Support Vector Machines with Linear Kernel
##
## 982 samples
## 784 predictors
## 10 classes: '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
##
## Pre-processing: centered (784), scaled (784)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 982, 982, 982, 982, 982, 982, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.00 NaN NaN
## 0.05 0.8761593 0.8621238
## 0.10 0.8761593 0.8621238
## 0.15 0.8761593 0.8621238
## 0.20 0.8761593 0.8621238
## 0.25 0.8761593 0.8621238
## 0.30 0.8761593 0.8621238
## 0.35 0.8761593 0.8621238
## 0.40 0.8761593 0.8621238
## 0.45 0.8761593 0.8621238
## 0.50 0.8761593 0.8621238
## 0.55 0.8761593 0.8621238
## 0.60 0.8761593 0.8621238
## 0.65 0.8761593 0.8621238
## 0.70 0.8761593 0.8621238
## 0.75 0.8761593 0.8621238
## 0.80 0.8761593 0.8621238
## 0.85 0.8761593 0.8621238
## 0.90 0.8761593 0.8621238
## 0.95 0.8761593 0.8621238
## 1.00 0.8761593 0.8621238
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.05.
# Score the held-out test rows with the tuned linear SVM, then tabulate the
# predictions against the true labels.
predict_svm_linear <- predict(object = model_svm_linear, newdata = test_nb)
confusionMatrix(data = predict_svm_linear, reference = test_nb[["label"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8 9
## 0 41 0 0 0 0 1 1 1 0 1
## 1 0 48 2 2 0 2 0 1 1 0
## 2 0 1 29 0 0 0 2 0 0 0
## 3 0 0 2 35 0 1 0 0 4 0
## 4 0 0 2 0 36 2 1 0 1 5
## 5 0 0 0 3 0 28 2 0 2 1
## 6 1 0 3 0 0 0 37 0 0 0
## 7 0 1 1 1 0 1 0 35 0 0
## 8 0 0 0 0 1 2 0 0 33 0
## 9 0 0 0 1 2 1 0 4 0 36
##
## Overall Statistics
##
## Accuracy : 0.8565
## 95% CI : (0.8191, 0.8886)
## No Information Rate : 0.1196
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8403
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.97619 0.9600 0.74359 0.83333 0.92308 0.73684
## Specificity 0.98936 0.9783 0.99208 0.98138 0.97098 0.97895
## Pos Pred Value 0.91111 0.8571 0.90625 0.83333 0.76596 0.77778
## Neg Pred Value 0.99732 0.9945 0.97409 0.98138 0.99191 0.97382
## Prevalence 0.10048 0.1196 0.09330 0.10048 0.09330 0.09091
## Detection Rate 0.09809 0.1148 0.06938 0.08373 0.08612 0.06699
## Detection Prevalence 0.10766 0.1340 0.07656 0.10048 0.11244 0.08612
## Balanced Accuracy 0.98278 0.9691 0.86784 0.90736 0.94703 0.85789
## Class: 6 Class: 7 Class: 8 Class: 9
## Sensitivity 0.86047 0.85366 0.80488 0.83721
## Specificity 0.98933 0.98939 0.99204 0.97867
## Pos Pred Value 0.90244 0.89744 0.91667 0.81818
## Neg Pred Value 0.98408 0.98417 0.97906 0.98128
## Prevalence 0.10287 0.09809 0.09809 0.10287
## Detection Rate 0.08852 0.08373 0.07895 0.08612
## Detection Prevalence 0.09809 0.09330 0.08612 0.10526
## Balanced Accuracy 0.92490 0.92152 0.89846 0.90794
#The accuracy rate in the recorded output above is 0.8565