# Section 1: Introduction
# Load dataset and required libraries
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(class)
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.3
## Loading required package: MASS
digit <- read.csv("C:/Users/shibo/OneDrive/Desktop/520 HW/Kaggle-digit-train-sample.csv")
# Principal Component Analysis
digit_pca <- prcomp(digit[,-1], scale = FALSE, center = TRUE)
# Proportion of variance explained by each principal component, and its cumulative sum
VarExp <- digit_pca$sdev^2 / sum(digit_pca$sdev^2)
VarExpcum <- cbind(1:784, cumsum(VarExp))
plot(cumsum(VarExp), xlab = "Number of Principal Components", ylab = "Cumulative Proportion of Variance Explained", type = "o")
abline(h=1, col = "red")

# As the plot shows, the first ~200 principal components explain around 97.5% of the total variance. To reduce running time, we keep only these leading components; the models below use columns 2:200 of the new data frame, i.e. PC1-PC199.
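# A minimal sketch of choosing the cutoff programmatically rather than reading it off the plot: take the smallest number of components whose cumulative share of variance reaches an assumed target of 97.5% (the figure quoted above).
target_var <- 0.975                                  # assumed target share of variance
ncomp <- which(cumsum(VarExp) >= target_var)[1]      # first index crossing the target
ncomp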
# Create new dataset: class label plus the principal component scores
# Note: cbind() coerces the factor label to its integer codes, so the classes reported below are 1-10 rather than the original digits 0-9
digit_final <- as.data.frame(cbind(label = as.factor(digit$label), digit_pca$x))
digit_final$label <- as.factor(digit_final$label)
# Create Training and Validation Datasets
split <- sample(1:nrow(digit_final), nrow(digit_final)*0.8)
train <- digit_final[split,]
test <- digit_final[-split,]
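# An alternative sketch for the split above: the sample() call is not seeded and does not stratify by digit. caret's createDataPartition() gives a reproducible, label-stratified 80/20 split (the seed value and object names below are illustrative assumptions).
set.seed(123)
idx <- createDataPartition(digit_final$label, p = 0.8, list = FALSE)
train_strat <- digit_final[idx, ]
test_strat <- digit_final[-idx, ]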
# Section 2: Naive Bayes
model_nb <- suppressWarnings(train(x = train[,2:200], y = train$label, method = "nb"))
predict_nb = suppressWarnings(predict(model_nb, test[1:280,2:200]))
confusionMatrix(predict_nb, test$label[1:280])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 30 0 0 0 0 2 1 2 1 1
## 2 0 25 0 0 0 0 0 0 0 0
## 3 0 0 29 1 1 2 0 3 2 2
## 4 0 1 2 22 0 2 0 0 0 0
## 5 0 0 1 1 13 0 0 0 0 9
## 6 2 2 0 2 2 10 0 0 0 1
## 7 0 0 0 0 0 0 21 0 0 0
## 8 0 1 0 0 1 0 0 20 0 0
## 9 0 0 3 2 0 1 2 0 20 0
## 10 0 1 0 1 3 0 0 5 1 26
##
## Overall Statistics
##
## Accuracy : 0.7714
## 95% CI : (0.7177, 0.8193)
## No Information Rate : 0.1393
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7447
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.9375 0.83333 0.8286 0.75862 0.65000 0.58824
## Specificity 0.9718 1.00000 0.9551 0.98008 0.95769 0.96578
## Pos Pred Value 0.8108 1.00000 0.7250 0.81481 0.54167 0.52632
## Neg Pred Value 0.9918 0.98039 0.9750 0.97233 0.97266 0.97318
## Prevalence 0.1143 0.10714 0.1250 0.10357 0.07143 0.06071
## Detection Rate 0.1071 0.08929 0.1036 0.07857 0.04643 0.03571
## Detection Prevalence 0.1321 0.08929 0.1429 0.09643 0.08571 0.06786
## Balanced Accuracy 0.9546 0.91667 0.8918 0.86935 0.80385 0.77701
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.87500 0.66667 0.83333 0.66667
## Specificity 1.00000 0.99200 0.96875 0.95436
## Pos Pred Value 1.00000 0.90909 0.71429 0.70270
## Neg Pred Value 0.98842 0.96124 0.98413 0.94650
## Prevalence 0.08571 0.10714 0.08571 0.13929
## Detection Rate 0.07500 0.07143 0.07143 0.09286
## Detection Prevalence 0.07500 0.07857 0.10000 0.13214
## Balanced Accuracy 0.93750 0.82933 0.90104 0.81051
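# The Naive Bayes call above relies on caret's default bootstrap tuning. A sketch of an explicit search over the "nb" method's tuning parameters (fL, usekernel, and, in recent caret versions, adjust) with 5-fold cross-validation; the grid values and fold count are assumptions, not the settings used above.
nb_grid <- expand.grid(fL = c(0, 1), usekernel = c(TRUE, FALSE), adjust = 1)
model_nb_cv <- suppressWarnings(train(x = train[,2:200], y = train$label, method = "nb",
                                      tuneGrid = nb_grid,
                                      trControl = trainControl(method = "cv", number = 5)))
model_nb_cv$bestTune   # selected smoothing / kernel settings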
# Section 3: K-Nearest Neighbor Method
model_knn <- train(x =train[,2:200], y = train$label, method = "knn")
predict_knn <- predict(model_knn, test[1:280,2:200])
confusionMatrix(predict_knn, test$label[1:280])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 31 0 0 0 0 0 0 0 0 0
## 2 0 29 0 0 0 0 1 3 0 0
## 3 0 0 27 0 0 0 0 0 0 0
## 4 0 0 1 28 0 1 0 0 0 0
## 5 0 0 0 1 17 1 0 0 1 1
## 6 1 0 0 0 0 12 0 0 1 0
## 7 0 0 1 0 0 1 23 0 0 0
## 8 0 1 2 0 0 0 0 26 1 0
## 9 0 0 3 0 0 1 0 0 20 0
## 10 0 0 1 0 3 1 0 1 1 38
##
## Overall Statistics
##
## Accuracy : 0.8964
## 95% CI : (0.8546, 0.9295)
## No Information Rate : 0.1393
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8842
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.9688 0.9667 0.77143 0.9655 0.85000 0.70588
## Specificity 1.0000 0.9840 1.00000 0.9920 0.98462 0.99240
## Pos Pred Value 1.0000 0.8788 1.00000 0.9333 0.80952 0.85714
## Neg Pred Value 0.9960 0.9960 0.96838 0.9960 0.98842 0.98120
## Prevalence 0.1143 0.1071 0.12500 0.1036 0.07143 0.06071
## Detection Rate 0.1107 0.1036 0.09643 0.1000 0.06071 0.04286
## Detection Prevalence 0.1107 0.1179 0.09643 0.1071 0.07500 0.05000
## Balanced Accuracy 0.9844 0.9753 0.88571 0.9788 0.91731 0.84914
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.95833 0.86667 0.83333 0.9744
## Specificity 0.99219 0.98400 0.98438 0.9710
## Pos Pred Value 0.92000 0.86667 0.83333 0.8444
## Neg Pred Value 0.99608 0.98400 0.98438 0.9957
## Prevalence 0.08571 0.10714 0.08571 0.1393
## Detection Rate 0.08214 0.09286 0.07143 0.1357
## Detection Prevalence 0.08929 0.10714 0.08571 0.1607
## Balanced Accuracy 0.97526 0.92533 0.90885 0.9727
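# The KNN model above is tuned over caret's small default grid of k values. A sketch of an explicit grid of odd k values with 5-fold cross-validation (the range of k and the fold count are assumptions):
knn_grid <- expand.grid(k = seq(1, 15, by = 2))
model_knn_cv <- train(x = train[,2:200], y = train$label, method = "knn",
                      tuneGrid = knn_grid,
                      trControl = trainControl(method = "cv", number = 5))
plot(model_knn_cv)   # accuracy as a function of k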
# Section 4: Support Vector Machine (SVM) and Bagged Trees
# Radial Method
set.seed(123)
model_svm_radial <- train(x = train[,2:200], y = train$label,
                          method = "svmRadial",
                          trControl = trainControl(method = "boot", number = 25))
predict_svm_radial = predict(model_svm_radial, test[1:280,2:200])
confusionMatrix(predict_svm_radial, test$label[1:280])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 31 0 0 0 0 0 0 0 0 0
## 2 0 27 0 0 0 0 0 0 0 0
## 3 0 0 29 1 1 0 0 0 0 1
## 4 0 0 1 22 0 2 0 0 1 1
## 5 1 2 0 1 18 2 0 0 1 7
## 6 0 1 1 2 0 11 0 0 0 0
## 7 0 0 1 0 0 1 24 0 0 0
## 8 0 0 0 0 0 0 0 26 0 1
## 9 0 0 3 3 0 1 0 2 21 1
## 10 0 0 0 0 1 0 0 2 1 28
##
## Overall Statistics
##
## Accuracy : 0.8464
## 95% CI : (0.7988, 0.8866)
## No Information Rate : 0.1393
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8289
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.9688 0.90000 0.8286 0.75862 0.90000 0.64706
## Specificity 1.0000 1.00000 0.9878 0.98008 0.94615 0.98479
## Pos Pred Value 1.0000 1.00000 0.9062 0.81481 0.56250 0.73333
## Neg Pred Value 0.9960 0.98814 0.9758 0.97233 0.99194 0.97736
## Prevalence 0.1143 0.10714 0.1250 0.10357 0.07143 0.06071
## Detection Rate 0.1107 0.09643 0.1036 0.07857 0.06429 0.03929
## Detection Prevalence 0.1107 0.09643 0.1143 0.09643 0.11429 0.05357
## Balanced Accuracy 0.9844 0.95000 0.9082 0.86935 0.92308 0.81592
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 1.00000 0.86667 0.87500 0.7179
## Specificity 0.99219 0.99600 0.96094 0.9834
## Pos Pred Value 0.92308 0.96296 0.67742 0.8750
## Neg Pred Value 1.00000 0.98419 0.98795 0.9556
## Prevalence 0.08571 0.10714 0.08571 0.1393
## Detection Rate 0.08571 0.09286 0.07500 0.1000
## Detection Prevalence 0.09286 0.09643 0.11071 0.1143
## Balanced Accuracy 0.99609 0.93133 0.91797 0.8507
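# The radial SVM above lets caret pick the kernel width sigma internally and tune only the cost C. A sketch of a joint grid over sigma and C with 5-fold cross-validation (the grid values are illustrative assumptions):
svm_grid <- expand.grid(sigma = c(0.001, 0.005, 0.01), C = c(0.5, 1, 2, 4))
model_svm_radial_grid <- train(x = train[,2:200], y = train$label,
                               method = "svmRadial",
                               tuneGrid = svm_grid,
                               trControl = trainControl(method = "cv", number = 5))
model_svm_radial_grid$bestTune   # selected sigma and C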
# Linear Method
model_svm_linear <- suppressWarnings(train(x = train[,2:200], y = train$label,
                                           method = "svmLinear",
                                           trControl = trainControl(method = "boot", number = 25),
                                           tuneGrid = expand.grid(C = seq(0, 1, 0.05))))
predict_svm_linear = suppressWarnings(predict(model_svm_linear, test[1:280,2:200]))
confusionMatrix(predict_svm_linear, test$label[1:280])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 28 0 0 0 0 0 0 0 0 0
## 2 0 26 2 0 1 0 1 0 0 0
## 3 0 0 27 2 0 0 0 0 0 1
## 4 0 0 1 21 0 3 0 0 3 1
## 5 1 1 0 1 17 3 0 0 1 6
## 6 1 1 0 3 1 10 0 1 0 1
## 7 1 0 2 0 0 0 22 0 1 0
## 8 0 1 0 0 0 0 0 27 0 3
## 9 0 1 1 1 0 1 1 0 19 0
## 10 1 0 2 1 1 0 0 2 0 27
##
## Overall Statistics
##
## Accuracy : 0.8
## 95% CI : (0.7483, 0.8452)
## No Information Rate : 0.1393
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7771
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.8750 0.86667 0.77143 0.7241 0.85000 0.58824
## Specificity 1.0000 0.98400 0.98776 0.9681 0.95000 0.96958
## Pos Pred Value 1.0000 0.86667 0.90000 0.7241 0.56667 0.55556
## Neg Pred Value 0.9841 0.98400 0.96800 0.9681 0.98800 0.97328
## Prevalence 0.1143 0.10714 0.12500 0.1036 0.07143 0.06071
## Detection Rate 0.1000 0.09286 0.09643 0.0750 0.06071 0.03571
## Detection Prevalence 0.1000 0.10714 0.10714 0.1036 0.10714 0.06429
## Balanced Accuracy 0.9375 0.92533 0.87959 0.8461 0.90000 0.77891
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.91667 0.90000 0.79167 0.69231
## Specificity 0.98438 0.98400 0.98047 0.97095
## Pos Pred Value 0.84615 0.87097 0.79167 0.79412
## Neg Pred Value 0.99213 0.98795 0.98047 0.95122
## Prevalence 0.08571 0.10714 0.08571 0.13929
## Detection Rate 0.07857 0.09643 0.06786 0.09643
## Detection Prevalence 0.09286 0.11071 0.08571 0.12143
## Balanced Accuracy 0.95052 0.94200 0.88607 0.83163
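# Note that the grid above starts at C = 0, a degenerate cost for a soft-margin SVM (one likely reason suppressWarnings() is needed). A sketch of the same search restricted to strictly positive costs:
linear_grid <- expand.grid(C = seq(0.05, 1, by = 0.05))   # strictly positive cost values
model_svm_linear_pos <- train(x = train[,2:200], y = train$label,
                              method = "svmLinear",
                              tuneGrid = linear_grid,
                              trControl = trainControl(method = "boot", number = 25))
model_svm_linear_pos$bestTune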
# Bagged Decision Trees (Treebag)
model_bag <- train(x = train[,2:200], y = train$label, method = "treebag")
predict_bag = predict(model_bag, test[1:280,2:200])
confusionMatrix(predict_bag, test$label[1:280])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8 9 10
## 1 28 0 0 0 0 1 0 0 1 2
## 2 0 28 0 0 1 0 1 3 0 0
## 3 0 0 28 0 1 1 0 2 1 0
## 4 2 0 1 22 0 1 0 0 3 0
## 5 0 0 0 0 11 1 0 0 0 8
## 6 1 1 0 2 2 12 1 4 1 1
## 7 1 0 2 2 0 0 22 0 1 1
## 8 0 0 0 0 1 0 0 18 0 2
## 9 0 1 3 1 2 0 0 2 15 0
## 10 0 0 1 2 2 1 0 1 2 25
##
## Overall Statistics
##
## Accuracy : 0.7464
## 95% CI : (0.6912, 0.7963)
## No Information Rate : 0.1393
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7174
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.8750 0.9333 0.8000 0.75862 0.55000 0.70588
## Specificity 0.9839 0.9800 0.9796 0.97211 0.96538 0.95057
## Pos Pred Value 0.8750 0.8485 0.8485 0.75862 0.55000 0.48000
## Neg Pred Value 0.9839 0.9919 0.9717 0.97211 0.96538 0.98039
## Prevalence 0.1143 0.1071 0.1250 0.10357 0.07143 0.06071
## Detection Rate 0.1000 0.1000 0.1000 0.07857 0.03929 0.04286
## Detection Prevalence 0.1143 0.1179 0.1179 0.10357 0.07143 0.08929
## Balanced Accuracy 0.9294 0.9567 0.8898 0.86537 0.75769 0.82823
## Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity 0.91667 0.60000 0.62500 0.64103
## Specificity 0.97266 0.98800 0.96484 0.96266
## Pos Pred Value 0.75862 0.85714 0.62500 0.73529
## Neg Pred Value 0.99203 0.95367 0.96484 0.94309
## Prevalence 0.08571 0.10714 0.08571 0.13929
## Detection Rate 0.07857 0.06429 0.05357 0.08929
## Detection Prevalence 0.10357 0.07500 0.08571 0.12143
## Balanced Accuracy 0.94466 0.79400 0.79492 0.80184
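# Treebag has no tuning grid in caret; the number of bagged trees defaults to ipred's nbagg = 25. A sketch that assumes train() forwards nbagg to the underlying ipred fit (the value 50 and the 5-fold resampling are illustrative assumptions):
model_bag_50 <- train(x = train[,2:200], y = train$label, method = "treebag",
                      nbagg = 50,
                      trControl = trainControl(method = "cv", number = 5))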
# Section 5: Algorithm Performance Comparison
model_comparison <- resamples(list(BAG = model_bag, SVMLinear = model_svm_linear, SVMRBF = model_svm_radial,
                                   NB = model_nb, KNN = model_knn))
summary(model_comparison)
##
## Call:
## summary.resamples(object = model_comparison)
##
## Models: BAG, SVMLinear, SVMRBF, NB, KNN
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## BAG 0.7279597 0.7518248 0.7686747 0.7658776 0.7780430 0.7965686 0
## SVMLinear 0.7344498 0.7540984 0.7585366 0.7611364 0.7680798 0.7877358 0
## SVMRBF 0.7703016 0.7881773 0.7944162 0.7970731 0.8048780 0.8252427 0
## NB 0.7018349 0.7234043 0.7420925 0.7378445 0.7505995 0.7728337 0
## KNN 0.8195122 0.8413462 0.8543689 0.8532514 0.8689320 0.8839907 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## BAG 0.6970237 0.7230001 0.7420023 0.7393404 0.7528865 0.7738797 0
## SVMLinear 0.7039868 0.7265492 0.7313291 0.7339011 0.7414183 0.7637517 0
## SVMRBF 0.7448682 0.7640093 0.7714209 0.7742559 0.7828577 0.8055508 0
## NB 0.6688730 0.6924955 0.7131052 0.7084721 0.7224075 0.7474759 0
## KNN 0.7992377 0.8236994 0.8379473 0.8364229 0.8539093 0.8706909 0
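# A sketch of a formal pairwise comparison of the resampled results: caret's diff() on a resamples object estimates the accuracy and kappa differences between models and reports p-values.
model_diffs <- diff(model_comparison)
summary(model_diffs)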
# Graphically Compare Performance
scales <- list(x = list(relation = "free"),
               y = list(relation = "free"))
bwplot(model_comparison, scales = scales)

# Conclusion: Based on the resampled accuracies above, the KNN model performs best (mean accuracy ~0.85), followed by the radial-kernel SVM, while Naive Bayes has the lowest accuracy and also the longest running time.
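# A sketch to back up the running-time comparison above: each caret train object records its total training time in times$everything, so the elapsed seconds per model can be tabulated.
sapply(list(NB = model_nb, KNN = model_knn, SVMRBF = model_svm_radial,
            SVMLinear = model_svm_linear, BAG = model_bag),
       function(m) m$times$everything[["elapsed"]])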