# Section 1: Introduction
# This report compares several classifiers (Naive Bayes, k-nearest neighbors, SVMs and
# bagged trees) on a sample of the Kaggle digit-recognition data, after using PCA to
# reduce the 784 pixel features.

# Load dataset and required libraries
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(class)
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.3
## Loading required package: MASS
digit <- read.csv("C:/Users/shibo/OneDrive/Desktop/520 HW/Kaggle-digit-train-sample.csv")

# Principal Component Analysis: center the pixel columns but do not scale them
# (many pixel columns are constant, and rescaling a zero-variance column would fail)
digit_pca <- prcomp(digit[,-1], scale = FALSE, center = TRUE)

# Proportion of variance explained by each principal component
VarExp <- digit_pca$sdev^2/sum(digit_pca$sdev^2)
VarExpcum <- cbind(1:784, cumsum(VarExp))   # component index vs cumulative variance, kept for reference
plot(cumsum(VarExp), xlab = "Number of Principal Components",
     ylab = "Cumulative Proportion of Variance Explained", type = "o")
abline(h = 1, col = "red")

# As the plot shows, the first 200 principal components explain roughly 97.5% of the total variance, so to reduce running time we keep only those leading components
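# A minimal sketch (not part of the original analysis): the cutoff can also be read off
# programmatically from the cumulative variance instead of eyeballing the plot.
n_pc <- which(cumsum(VarExp) >= 0.975)[1]   # smallest number of PCs reaching 97.5%
n_pc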

# Create new dataset
# Note: cbind() coerces the factor to its integer codes, so the digits 0-9 appear as
# class labels 1-10 in the confusion matrices below
digit_final <- as.data.frame(cbind(label=as.factor(digit$label),digit_pca$x))
digit_final$label <- as.factor(digit_final$label)
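# Side note (alternative construction, not used below; digit_alt is a hypothetical name):
# building the frame with data.frame() would keep the original 0-9 digit labels, whereas
# cbind() replaces them with integer codes 1-10.
digit_alt <- data.frame(label = factor(digit$label), digit_pca$x)
str(digit_alt$label)   # factor with levels "0"-"9"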

# Create Training and Validation Datasets
set.seed(123)   # fix the seed so the 80/20 split is reproducible
split <- sample(1:nrow(digit_final), floor(nrow(digit_final) * 0.8))
train <- digit_final[split,]
test <- digit_final[-split,]
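# Optional alternative (hypothetical names idx/train_strat/test_strat): caret's
# createDataPartition() draws a split stratified by digit label, so every class keeps
# roughly the same proportion in the training and validation sets.
idx <- createDataPartition(digit_final$label, p = 0.8, list = FALSE)
train_strat <- digit_final[idx, ]
test_strat  <- digit_final[-idx, ]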


# Section 2: Naive Bayes
# Columns 2:200 hold the first 199 principal components (column 1 is the label);
# predictions are scored on the first 280 test rows to keep runtime manageable
model_nb <- suppressWarnings(train(x = train[,2:200], y = train$label, method = "nb"))
predict_nb <- suppressWarnings(predict(model_nb, test[1:280,2:200]))
confusionMatrix(predict_nb, test$label[1:280])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5  6  7  8  9 10
##         1  30  0  0  0  0  2  1  2  1  1
##         2   0 25  0  0  0  0  0  0  0  0
##         3   0  0 29  1  1  2  0  3  2  2
##         4   0  1  2 22  0  2  0  0  0  0
##         5   0  0  1  1 13  0  0  0  0  9
##         6   2  2  0  2  2 10  0  0  0  1
##         7   0  0  0  0  0  0 21  0  0  0
##         8   0  1  0  0  1  0  0 20  0  0
##         9   0  0  3  2  0  1  2  0 20  0
##         10  0  1  0  1  3  0  0  5  1 26
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7714          
##                  95% CI : (0.7177, 0.8193)
##     No Information Rate : 0.1393          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7447          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9375  0.83333   0.8286  0.75862  0.65000  0.58824
## Specificity            0.9718  1.00000   0.9551  0.98008  0.95769  0.96578
## Pos Pred Value         0.8108  1.00000   0.7250  0.81481  0.54167  0.52632
## Neg Pred Value         0.9918  0.98039   0.9750  0.97233  0.97266  0.97318
## Prevalence             0.1143  0.10714   0.1250  0.10357  0.07143  0.06071
## Detection Rate         0.1071  0.08929   0.1036  0.07857  0.04643  0.03571
## Detection Prevalence   0.1321  0.08929   0.1429  0.09643  0.08571  0.06786
## Balanced Accuracy      0.9546  0.91667   0.8918  0.86935  0.80385  0.77701
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           0.87500  0.66667  0.83333   0.66667
## Specificity           1.00000  0.99200  0.96875   0.95436
## Pos Pred Value        1.00000  0.90909  0.71429   0.70270
## Neg Pred Value        0.98842  0.96124  0.98413   0.94650
## Prevalence            0.08571  0.10714  0.08571   0.13929
## Detection Rate        0.07500  0.07143  0.07143   0.09286
## Detection Prevalence  0.07500  0.07857  0.10000   0.13214
## Balanced Accuracy     0.93750  0.82933  0.90104   0.81051
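# Sketch (hypothetical names, results not shown): caret's "nb" model exposes fL,
# usekernel and adjust as tuning parameters, so the Naive Bayes fit above could be
# tuned over an explicit grid instead of the default one.
nb_grid <- expand.grid(fL = 0, usekernel = c(TRUE, FALSE), adjust = 1)
model_nb_tuned <- suppressWarnings(train(x = train[,2:200], y = train$label,
                                         method = "nb", tuneGrid = nb_grid,
                                         trControl = trainControl(method = "boot", number = 25)))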
# Section 3: K-Nearest Neighbor Method
model_knn <- train(x = train[,2:200], y = train$label, method = "knn")
predict_knn <- predict(model_knn, test[1:280,2:200])
confusionMatrix(predict_knn, test$label[1:280])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5  6  7  8  9 10
##         1  31  0  0  0  0  0  0  0  0  0
##         2   0 29  0  0  0  0  1  3  0  0
##         3   0  0 27  0  0  0  0  0  0  0
##         4   0  0  1 28  0  1  0  0  0  0
##         5   0  0  0  1 17  1  0  0  1  1
##         6   1  0  0  0  0 12  0  0  1  0
##         7   0  0  1  0  0  1 23  0  0  0
##         8   0  1  2  0  0  0  0 26  1  0
##         9   0  0  3  0  0  1  0  0 20  0
##         10  0  0  1  0  3  1  0  1  1 38
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8964          
##                  95% CI : (0.8546, 0.9295)
##     No Information Rate : 0.1393          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8842          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9688   0.9667  0.77143   0.9655  0.85000  0.70588
## Specificity            1.0000   0.9840  1.00000   0.9920  0.98462  0.99240
## Pos Pred Value         1.0000   0.8788  1.00000   0.9333  0.80952  0.85714
## Neg Pred Value         0.9960   0.9960  0.96838   0.9960  0.98842  0.98120
## Prevalence             0.1143   0.1071  0.12500   0.1036  0.07143  0.06071
## Detection Rate         0.1107   0.1036  0.09643   0.1000  0.06071  0.04286
## Detection Prevalence   0.1107   0.1179  0.09643   0.1071  0.07500  0.05000
## Balanced Accuracy      0.9844   0.9753  0.88571   0.9788  0.91731  0.84914
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           0.95833  0.86667  0.83333    0.9744
## Specificity           0.99219  0.98400  0.98438    0.9710
## Pos Pred Value        0.92000  0.86667  0.83333    0.8444
## Neg Pred Value        0.99608  0.98400  0.98438    0.9957
## Prevalence            0.08571  0.10714  0.08571    0.1393
## Detection Rate        0.08214  0.09286  0.07143    0.1357
## Detection Prevalence  0.08929  0.10714  0.08571    0.1607
## Balanced Accuracy     0.97526  0.92533  0.90885    0.9727
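# Sketch (hypothetical names, results not shown): the k chosen by resampling can be
# inspected directly, and a wider grid of odd k values could be searched explicitly.
model_knn$bestTune                           # k selected for the model above
knn_grid <- expand.grid(k = seq(3, 15, 2))
model_knn_tuned <- train(x = train[,2:200], y = train$label,
                         method = "knn", tuneGrid = knn_grid)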
# Section 4: Support Vector Machine (SVM)

# Radial Method
set.seed(123)
model_svm_radial <- train(x = train[,2:200], y = train$label, 
                                     method = "svmRadial",
                                     trControl = trainControl(method = "boot",number = 25))

predict_svm_radial <- predict(model_svm_radial, test[1:280,2:200])
confusionMatrix(predict_svm_radial, test$label[1:280])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5  6  7  8  9 10
##         1  31  0  0  0  0  0  0  0  0  0
##         2   0 27  0  0  0  0  0  0  0  0
##         3   0  0 29  1  1  0  0  0  0  1
##         4   0  0  1 22  0  2  0  0  1  1
##         5   1  2  0  1 18  2  0  0  1  7
##         6   0  1  1  2  0 11  0  0  0  0
##         7   0  0  1  0  0  1 24  0  0  0
##         8   0  0  0  0  0  0  0 26  0  1
##         9   0  0  3  3  0  1  0  2 21  1
##         10  0  0  0  0  1  0  0  2  1 28
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8464          
##                  95% CI : (0.7988, 0.8866)
##     No Information Rate : 0.1393          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8289          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.9688  0.90000   0.8286  0.75862  0.90000  0.64706
## Specificity            1.0000  1.00000   0.9878  0.98008  0.94615  0.98479
## Pos Pred Value         1.0000  1.00000   0.9062  0.81481  0.56250  0.73333
## Neg Pred Value         0.9960  0.98814   0.9758  0.97233  0.99194  0.97736
## Prevalence             0.1143  0.10714   0.1250  0.10357  0.07143  0.06071
## Detection Rate         0.1107  0.09643   0.1036  0.07857  0.06429  0.03929
## Detection Prevalence   0.1107  0.09643   0.1143  0.09643  0.11429  0.05357
## Balanced Accuracy      0.9844  0.95000   0.9082  0.86935  0.92308  0.81592
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           1.00000  0.86667  0.87500    0.7179
## Specificity           0.99219  0.99600  0.96094    0.9834
## Pos Pred Value        0.92308  0.96296  0.67742    0.8750
## Neg Pred Value        1.00000  0.98419  0.98795    0.9556
## Prevalence            0.08571  0.10714  0.08571    0.1393
## Detection Rate        0.08571  0.09286  0.07500    0.1000
## Detection Prevalence  0.09286  0.09643  0.11071    0.1143
## Balanced Accuracy     0.99609  0.93133  0.91797    0.8507
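# Sketch (hypothetical name, results not shown): the sigma/C pair picked by the
# bootstrap can be inspected, and tuneLength would let caret evaluate a longer
# sequence of cost values.
model_svm_radial$bestTune
model_svm_radial_wide <- train(x = train[,2:200], y = train$label,
                               method = "svmRadial", tuneLength = 10,
                               trControl = trainControl(method = "boot", number = 25))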
# Linear Method
# The cost grid starts at C = 0, which is not a meaningful SVM cost and is the likely
# source of the warnings suppressed below; only the C > 0 values contribute to tuning
model_svm_linear <- suppressWarnings(train(x = train[,2:200], y = train$label,
                          method = "svmLinear",
                          trControl = trainControl(method = "boot", number = 25),
                          tuneGrid = expand.grid(C = seq(0, 1, 0.05))))

predict_svm_linear <- suppressWarnings(predict(model_svm_linear, test[1:280,2:200]))
confusionMatrix(predict_svm_linear, test$label[1:280])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5  6  7  8  9 10
##         1  28  0  0  0  0  0  0  0  0  0
##         2   0 26  2  0  1  0  1  0  0  0
##         3   0  0 27  2  0  0  0  0  0  1
##         4   0  0  1 21  0  3  0  0  3  1
##         5   1  1  0  1 17  3  0  0  1  6
##         6   1  1  0  3  1 10  0  1  0  1
##         7   1  0  2  0  0  0 22  0  1  0
##         8   0  1  0  0  0  0  0 27  0  3
##         9   0  1  1  1  0  1  1  0 19  0
##         10  1  0  2  1  1  0  0  2  0 27
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7483, 0.8452)
##     No Information Rate : 0.1393          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7771          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.8750  0.86667  0.77143   0.7241  0.85000  0.58824
## Specificity            1.0000  0.98400  0.98776   0.9681  0.95000  0.96958
## Pos Pred Value         1.0000  0.86667  0.90000   0.7241  0.56667  0.55556
## Neg Pred Value         0.9841  0.98400  0.96800   0.9681  0.98800  0.97328
## Prevalence             0.1143  0.10714  0.12500   0.1036  0.07143  0.06071
## Detection Rate         0.1000  0.09286  0.09643   0.0750  0.06071  0.03571
## Detection Prevalence   0.1000  0.10714  0.10714   0.1036  0.10714  0.06429
## Balanced Accuracy      0.9375  0.92533  0.87959   0.8461  0.90000  0.77891
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           0.91667  0.90000  0.79167   0.69231
## Specificity           0.98438  0.98400  0.98047   0.97095
## Pos Pred Value        0.84615  0.87097  0.79167   0.79412
## Neg Pred Value        0.99213  0.98795  0.98047   0.95122
## Prevalence            0.08571  0.10714  0.08571   0.13929
## Detection Rate        0.07857  0.09643  0.06786   0.09643
## Detection Prevalence  0.09286  0.11071  0.08571   0.12143
## Balanced Accuracy     0.95052  0.94200  0.88607   0.83163
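# Sketch (plot not shown): plotting the fitted train object displays resampled
# accuracy as a function of the cost C, which shows which grid value was selected.
plot(model_svm_linear)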
# Treebag (bagged CART) - not an SVM, but included as an additional benchmark
model_bag <- train(x = train[,2:200], y = train$label, method = "treebag")

predict_bag <- predict(model_bag, test[1:280,2:200])
confusionMatrix(predict_bag, test$label[1:280])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3  4  5  6  7  8  9 10
##         1  28  0  0  0  0  1  0  0  1  2
##         2   0 28  0  0  1  0  1  3  0  0
##         3   0  0 28  0  1  1  0  2  1  0
##         4   2  0  1 22  0  1  0  0  3  0
##         5   0  0  0  0 11  1  0  0  0  8
##         6   1  1  0  2  2 12  1  4  1  1
##         7   1  0  2  2  0  0 22  0  1  1
##         8   0  0  0  0  1  0  0 18  0  2
##         9   0  1  3  1  2  0  0  2 15  0
##         10  0  0  1  2  2  1  0  1  2 25
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7464          
##                  95% CI : (0.6912, 0.7963)
##     No Information Rate : 0.1393          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7174          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity            0.8750   0.9333   0.8000  0.75862  0.55000  0.70588
## Specificity            0.9839   0.9800   0.9796  0.97211  0.96538  0.95057
## Pos Pred Value         0.8750   0.8485   0.8485  0.75862  0.55000  0.48000
## Neg Pred Value         0.9839   0.9919   0.9717  0.97211  0.96538  0.98039
## Prevalence             0.1143   0.1071   0.1250  0.10357  0.07143  0.06071
## Detection Rate         0.1000   0.1000   0.1000  0.07857  0.03929  0.04286
## Detection Prevalence   0.1143   0.1179   0.1179  0.10357  0.07143  0.08929
## Balanced Accuracy      0.9294   0.9567   0.8898  0.86537  0.75769  0.82823
##                      Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity           0.91667  0.60000  0.62500   0.64103
## Specificity           0.97266  0.98800  0.96484   0.96266
## Pos Pred Value        0.75862  0.85714  0.62500   0.73529
## Neg Pred Value        0.99203  0.95367  0.96484   0.94309
## Prevalence            0.08571  0.10714  0.08571   0.13929
## Detection Rate        0.07857  0.06429  0.05357   0.08929
## Detection Prevalence  0.10357  0.07500  0.08571   0.12143
## Balanced Accuracy     0.94466  0.79400  0.79492   0.80184
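# Sketch (output not shown): varImp() on the bagged-tree fit indicates which
# principal components the trees rely on most.
plot(varImp(model_bag), top = 20)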
# Section 5: Algorithm Performance Comparison
model_comparison <- resamples(list(BAG = model_bag, SVMLinear = model_svm_linear, SVMRBF = model_svm_radial,
                                   NB = model_nb, KNN = model_knn))

summary(model_comparison)
## 
## Call:
## summary.resamples(object = model_comparison)
## 
## Models: BAG, SVMLinear, SVMRBF, NB, KNN 
## Number of resamples: 25 
## 
## Accuracy 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## BAG       0.7279597 0.7518248 0.7686747 0.7658776 0.7780430 0.7965686    0
## SVMLinear 0.7344498 0.7540984 0.7585366 0.7611364 0.7680798 0.7877358    0
## SVMRBF    0.7703016 0.7881773 0.7944162 0.7970731 0.8048780 0.8252427    0
## NB        0.7018349 0.7234043 0.7420925 0.7378445 0.7505995 0.7728337    0
## KNN       0.8195122 0.8413462 0.8543689 0.8532514 0.8689320 0.8839907    0
## 
## Kappa 
##                Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## BAG       0.6970237 0.7230001 0.7420023 0.7393404 0.7528865 0.7738797    0
## SVMLinear 0.7039868 0.7265492 0.7313291 0.7339011 0.7414183 0.7637517    0
## SVMRBF    0.7448682 0.7640093 0.7714209 0.7742559 0.7828577 0.8055508    0
## NB        0.6688730 0.6924955 0.7131052 0.7084721 0.7224075 0.7474759    0
## KNN       0.7992377 0.8236994 0.8379473 0.8364229 0.8539093 0.8706909    0
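# Sketch (output not shown): diff() on the resamples object gives pairwise accuracy
# and kappa differences between the models, with p-values for whether they differ.
model_diffs <- diff(model_comparison)
summary(model_diffs)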
# Graphically Compare Performance
# Free the axis scales so the Accuracy and Kappa panels each use their own range
scales <- list(x = list(relation = "free"),
               y = list(relation = "free"))
 
bwplot(model_comparison, scales = scales)

# Conclusion: KNN has the highest resampled accuracy (mean ~0.853), while Naive Bayes has the lowest (mean ~0.738) and also the longest running time; the timing check below makes the runtime comparison explicit.
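# Sketch (output not shown) to back up the running-time remark: each caret train
# object stores its timing, so the elapsed training seconds can be compared directly.
sapply(list(NB = model_nb, KNN = model_knn, SVMRBF = model_svm_radial,
            SVMLinear = model_svm_linear, BAG = model_bag),
       function(m) m$times$everything["elapsed"])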