library(e1071)
library(ipred)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(tidymodels)
## -- Attaching packages ---------------------------------------------------------- tidymodels 0.1.1 --
## v broom     0.7.1      v recipes   0.1.15
## v dials     0.0.9      v rsample   0.0.8 
## v infer     0.5.3      v tune      0.1.2 
## v modeldata 0.1.0      v workflows 0.2.1 
## v parsnip   0.1.4      v yardstick 0.0.7
## -- Conflicts ------------------------------------------------------------- tidymodels_conflicts() --
## x scales::discard()        masks purrr::discard()
## x dplyr::filter()          masks stats::filter()
## x recipes::fixed()         masks stringr::fixed()
## x dplyr::lag()             masks stats::lag()
## x purrr::lift()            masks caret::lift()
## x yardstick::precision()   masks caret::precision()
## x yardstick::recall()      masks caret::recall()
## x yardstick::sensitivity() masks caret::sensitivity()
## x yardstick::spec()        masks readr::spec()
## x yardstick::specificity() masks caret::specificity()
## x recipes::step()          masks stats::step()
## x tune::tune()             masks e1071::tune()

Get the data

set.seed(1234)

#Read the HW1 data and convert character columns to factors
data <- read.csv('https://raw.githubusercontent.com/san123i/CUNY/master/Semester4/622-MachineLearning/Assignments/Assignment1/HW1.csv', header = TRUE) %>%
  mutate_if(is.character, as.factor)

Split the data

#Split the data
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.7, 0.3))
train <- data[ind == 1, ]
test <- data[ind == 2, ]
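
As an aside, rsample (loaded above via tidymodels) offers a stratified alternative to the sample()-based split; a minimal sketch, not used for the results below, assuming the class column label drives the stratification:

#Stratified 70/30 split with rsample (sketch only)
split_obj <- initial_split(data, prop = 0.7, strata = label)
train_strat <- training(split_obj)
test_strat <- testing(split_obj)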

Bagging model and predictions

#Train the bagging model using Ipred package

bag_model <- bagging(formula = label ~ ., data = train,  nbagg = 30)

bag_model
## 
## Bagging classification trees with 30 bootstrap replications 
## 
## Call: bagging.data.frame(formula = label ~ ., data = train, nbagg = 30)
#Prediction for bagging model
bag_model_pred <- predict(bag_model, test)

bag_test_pred <- cbind(test, prediction_values = bag_model_pred)

Confusion matrix for Bagging

#Confusion matrix built as table(actual, predicted), so rows are the actual labels
bag_cm <- confusionMatrix(table(bag_test_pred$label, bag_test_pred$prediction_values))

bag_cm
## Confusion Matrix and Statistics
## 
##        
##         BLACK BLUE
##   BLACK     6    0
##   BLUE      0    1
##                                      
##                Accuracy : 1          
##                  95% CI : (0.5904, 1)
##     No Information Rate : 0.8571     
##     P-Value [Acc > NIR] : 0.3399     
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8571     
##          Detection Rate : 0.8571     
##    Detection Prevalence : 0.8571     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : BLACK      
## 
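
With only seven test observations, the confusion matrix above is a fairly coarse accuracy estimate. ipred's bagging() can also report an out-of-bag error when coob = TRUE is supplied; a minimal sketch, not part of the results above:

#Out-of-bag estimate of the misclassification error (sketch only)
bag_model_oob <- bagging(formula = label ~ ., data = train, nbagg = 30, coob = TRUE)
bag_model_oob$err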

LOOCV Model and predictions

#LOOCV model: fit one naiveBayes model per left-out training row
N <- nrow(train)
model_loocv_train <- do.call('rbind', lapply(1:N, FUN = function(idx, data = train){
  
  #Train on every row except the idx-th
  model_nb <- naiveBayes(label ~ ., data = data[-idx, ])
  
  #Column 3 holds the class label; stored in a list column, it prints below as the factor codes 1 (BLACK) / 2 (BLUE)
  list(fold = idx, model = model_nb, actual = data[idx, 3])
  
})) %>% data.frame()
model_loocv_train
##    fold
## 1     1
## 2     2
## 3     3
## 4     4
## 5     5
## 6     6
## 7     7
## 8     8
## 9     9
## 10   10
## 11   11
## 12   12
## 13   13
## 14   14
## 15   15
## 16   16
## 17   17
## 18   18
## 19   19
## 20   20
## 21   21
## 22   22
## 23   23
## 24   24
## 25   25
## 26   26
## 27   27
## 28   28
## 29   29
##                                                                                                                                                                                                                                                                   model
## 1                                16, 12, 38.875, 36.5, 19.96622, 22.45197, 0.25, 0.08333333, 0.125, 0.1666667, 0.0625, 0.4166667, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 2       15, 13, 41.13333, 34.07692, 18.43082, 23.20367, 0.2666667, 0.1538462, 0.06666667, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 3                                 16, 12, 38.875, 36.5, 19.96622, 22.45197, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.3333333, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 4       15, 13, 41.13333, 34.07692, 18.43082, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.06666667, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 5              15, 13, 41.13333, 34.07692, 18.43082, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 6                            16, 12, 38.875, 35.33333, 19.96622, 23.76909, 0.25, 0.08333333, 0.125, 0.1666667, 0.0625, 0.4166667, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 7                            16, 12, 38.875, 35.33333, 19.96622, 23.76909, 0.25, 0.1666667, 0.125, 0.08333333, 0.0625, 0.4166667, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 8                             16, 12, 38.875, 35.33333, 19.96622, 23.76909, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.3333333, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 9                            16, 12, 38.875, 35.33333, 19.96622, 23.76909, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.4166667, 0.125, 0.08333333, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 10     15, 13, 40.2, 34.07692, 19.92558, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.1333333, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 11                                     16, 12, 38.875, 35.33333, 19.96622, 23.76909, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.4166667, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 12             15, 13, 39.13333, 34.07692, 20.63931, 23.20367, 0.2, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 13                                  16, 12, 38.875, 34, 19.96622, 24.23371, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.3333333, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 14 15, 13, 39.13333, 34.07692, 20.63931, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.1333333, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 15             15, 13, 39.13333, 34.07692, 20.63931, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 16             15, 13, 38.06667, 34.07692, 20.39421, 23.20367, 0.2, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 17      15, 13, 38.06667, 34.07692, 20.39421, 23.20367, 0.2666667, 0.1538462, 0.06666667, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 18                            16, 12, 38.875, 32.66667, 19.96622, 23.64638, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.3333333, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 19      15, 13, 38.06667, 34.07692, 20.39421, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.06666667, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 20 15, 13, 38.06667, 34.07692, 20.39421, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.1333333, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 21             15, 13, 38.06667, 34.07692, 20.39421, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 22                 15, 13, 37.8, 34.07692, 20.18203, 23.20367, 0.2, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 23                    15, 13, 37.8, 34.07692, 20.18203, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 24                 15, 13, 37.8, 34.07692, 20.18203, 23.20367, 0.2666667, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 25             15, 13, 37.26667, 34.07692, 19.56479, 23.20367, 0.2, 0.1538462, 0.1333333, 0.1538462, 0.06666667, 0.3846154, 0.1333333, 0.1538462, 0.2, 0.07692308, 0.2666667, 0.07692308, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 26                           16, 12, 38.875, 31.66667, 19.96622, 22.47153, 0.25, 0.1666667, 0.125, 0.08333333, 0.0625, 0.4166667, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 27                            16, 12, 38.875, 31.66667, 19.96622, 22.47153, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.3333333, 0.125, 0.1666667, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 28                           16, 12, 38.875, 31.66667, 19.96622, 22.47153, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.4166667, 0.125, 0.08333333, 0.1875, 0.08333333, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
## 29                                     16, 12, 38.875, 31.66667, 19.96622, 22.47153, 0.25, 0.1666667, 0.125, 0.1666667, 0.0625, 0.4166667, 0.125, 0.1666667, 0.1875, 0, 0.25, 0.08333333, BLACK, BLUE, TRUE, FALSE, naiveBayes.default(x = X, y = Y, laplace = laplace)
##    actual
## 1       2
## 2       1
## 3       2
## 4       1
## 5       1
## 6       2
## 7       2
## 8       2
## 9       2
## 10      1
## 11      2
## 12      1
## 13      2
## 14      1
## 15      1
## 16      1
## 17      1
## 18      2
## 19      1
## 20      1
## 21      1
## 22      1
## 23      1
## 24      1
## 25      1
## 26      2
## 27      2
## 28      2
## 29      2
#LOOCV predictions: score each fold's model against the full test set
model_loocv_pred <- do.call('rbind', lapply(model_loocv_train$model, FUN = function(m, data = test){
  
  #Prediction from one fold's model, bound to the test rows
  prediction <- predict(m, data)
  cbind(prediction, data)
}))
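
Note that the loop above evaluates every leave-one-out model on the separate test set. In the more conventional form of LOOCV, each model is scored on the single training row that was held out; a minimal sketch of that variant, using the same naiveBayes learner:

#Conventional LOOCV: predict the single held-out training row each time (sketch only)
loocv_holdout_pred <- sapply(1:N, function(idx) {
  m <- naiveBayes(label ~ ., data = train[-idx, ])
  as.character(predict(m, train[idx, ]))
})
#Leave-one-out accuracy on the training data
mean(loocv_holdout_pred == as.character(train$label))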

Confusion matrix for LOOCV

#Confusion matrix built as table(actual, predicted), so rows are the actual labels
loocv_cfm <- table(model_loocv_pred$label, model_loocv_pred$prediction) %>% confusionMatrix()

loocv_cfm
## Confusion Matrix and Statistics
## 
##        
##         BLACK BLUE
##   BLACK   166    8
##   BLUE     29    0
##                                           
##                Accuracy : 0.8177          
##                  95% CI : (0.7576, 0.8683)
##     No Information Rate : 0.9606          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : -0.0658         
##                                           
##  Mcnemar's Test P-Value : 0.001009        
##                                           
##             Sensitivity : 0.8513          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9540          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9606          
##          Detection Rate : 0.8177          
##    Detection Prevalence : 0.8571          
##       Balanced Accuracy : 0.4256          
##                                           
##        'Positive' Class : BLACK           
## 

Comparing the models

Comparing the test accuracies of the Logistic Regression (LR), KNN, and Naive Bayes (NB) models from the HW1 assignment with the Bagging and LOOCV models above.

#Comparing previous results
test_accu_LR <- 0.8571
test_accu_knn3 <- 1
test_accu_nb <- 0.8751

cbind(Model = c('LR',
                'KNN_3', 
                'NB', 
                'BAGGING', 
                'LOOCV'), 
      combined_accuracy = c(test_accu_LR, test_accu_knn3,
                                  test_accu_nb,
                                  bag_cm$overall[1],
                                  loocv_cfm$overall[1])) %>%
  data.frame(row.names = 'Model') %>% arrange(desc(combined_accuracy))
##         combined_accuracy
## KNN_3                   1
## BAGGING                 1
## NB                 0.8751
## LR                 0.8571
## LOOCV   0.817733990147783
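
One caveat with the cbind() above: combining the character model names with numeric accuracies coerces everything to character, which is why the LOOCV accuracy prints as a long string and the sort is effectively alphabetical (it happens to give the correct order here). A tibble keeps the accuracies numeric; a minimal sketch:

#Keep the accuracies numeric by building a tibble instead of cbind-ing (sketch only)
tibble(Model = c('LR', 'KNN_3', 'NB', 'BAGGING', 'LOOCV'),
       combined_accuracy = c(test_accu_LR, test_accu_knn3, test_accu_nb,
                             bag_cm$overall[1], loocv_cfm$overall[1])) %>%
  arrange(desc(combined_accuracy))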

Observations

  1. Bagging performed much better than LOOCV and was on par with KNN (k = 3), the best performer in my previous assignment.
  2. LOOCV has low bias but tends to have high variance, since each leave-one-out model is trained on nearly the entire training set and the folds overlap almost completely.
  3. Bagging was also computationally quicker than LOOCV here; a rough way to check this is sketched below.
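
A rough way to verify the runtime claim in point 3 is to wrap each training step in system.time(); a minimal sketch, with elapsed times that will vary by machine:

#Rough timing comparison (sketch only)
system.time(bagging(formula = label ~ ., data = train, nbagg = 30))
system.time(lapply(1:N, function(idx) naiveBayes(label ~ ., data = train[-idx, ])))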