# ipred provides bagging(); e1071 provides naiveBayes().
# caret and pROC are called later via the :: operator.
library(ipred)
library(e1071)
path <- "file:///Users/bobo/Documents/622/HW1/15001956_p1_dataset_hw1.csv"
data <- read.csv(path, header=TRUE, sep=',', stringsAsFactors=FALSE)

Review the data with some quick and easy exploratory data analysis (EDA).

head(data)
##   X Y label
## 1 5 a  BLUE
## 2 5 b BLACK
## 3 5 c  BLUE
## 4 5 d BLACK
## 5 5 e BLACK
## 6 5 f BLACK
str(data)
## 'data.frame':    36 obs. of  3 variables:
##  $ X    : int  5 5 5 5 5 5 19 19 19 19 ...
##  $ Y    : chr  "a" "b" "c" "d" ...
##  $ label: chr  "BLUE" "BLACK" "BLUE" "BLACK" ...
data$Y <- as.factor(data$Y)
data$label <- as.factor(data$label)
table(data$label)
## 
## BLACK  BLUE 
##    22    14
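
The counts above work out to roughly 61% BLACK and 39% BLUE. As a small addition (not part of the original workflow), the proportions can be shown directly:

prop.table(table(data$label))   # expected: BLACK ~0.61, BLUE ~0.39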

Bagging

Let us split our data into 70% training and 30% testing datasets.

set.seed(11)
# createDataPartition returns the row indices for the (stratified) 70% training partition
tstidx <- caret::createDataPartition(data$label, p=0.7, list=FALSE)
trdata <- data[tstidx,]
tsdata <- data[-tstidx,]

summary(trdata)
##        X         Y       label   
##  Min.   : 5.00   a:4   BLACK:16  
##  1st Qu.:19.00   b:5   BLUE :10  
##  Median :51.00   c:5             
##  Mean   :40.23   d:6             
##  3rd Qu.:55.00   e:4             
##  Max.   :63.00   f:2
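
Because createDataPartition stratifies on the label, the BLACK/BLUE balance should be roughly preserved in both partitions. A quick check (a small addition, not part of the original workflow):

rbind(train = table(trdata$label), test = table(tsdata$label))   # expected roughly 16/10 vs 6/4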

Bagging Model

bagging_model <- bagging(label ~ .,
                         data = trdata,
                         nbagg = 100,   # number of bootstrap replications
                         coob = TRUE    # compute the out-of-bag error estimate
                         )

bagging_model
## 
## Bagging classification trees with 100 bootstrap replications 
## 
## Call: bagging.data.frame(formula = label ~ ., data = trdata, nbagg = 100, 
##     coob = TRUE)
## 
## Out-of-bag estimate of misclassification error:  0.3077
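
The out-of-bag error is estimated from the rows left out of each bootstrap sample, so it can move around on a training set this small. As a sketch (assuming the classbagg object exposes the OOB error in its err component when coob = TRUE), we could re-fit with a few ensemble sizes and watch how the estimate behaves:

# re-fit the bagged trees with several ensemble sizes and collect the OOB error
nbagg_grid <- c(10, 25, 50, 100, 200)
oob_err <- sapply(nbagg_grid, function(nb) {
  set.seed(11)   # same seed so runs differ only in the number of trees
  fit <- bagging(label ~ ., data = trdata, nbagg = nb, coob = TRUE)
  fit$err        # out-of-bag misclassification estimate
})
data.frame(nbagg = nbagg_grid, oob_error = oob_err)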

Prediction vs actual

bagging_pred <- predict(bagging_model, tsdata)

bagging_table <- table(bagging_pred, tsdata$label)
bagging_table
##             
## bagging_pred BLACK BLUE
##        BLACK     3    2
##        BLUE      3    2
# confusion-matrix cells, treating BLUE as the positive class
# (rows of bagging_table are predictions, columns are the actual labels)
tn1 <- bagging_table[1,1]   # predicted BLACK, actually BLACK
fn1 <- bagging_table[1,2]   # predicted BLACK, actually BLUE
fp1 <- bagging_table[2,1]   # predicted BLUE, actually BLACK
tp1 <- bagging_table[2,2]   # predicted BLUE, actually BLUE
accuracy1 <- (tp1+tn1)/(tp1+tn1+fp1+fn1)
tpr1 <- tp1/(tp1+fn1)   # sensitivity / recall
tnr1 <- tn1/(tn1+fp1)   # specificity
fnr1 <- 1-tpr1
fpr1 <- 1-tnr1

# convert the actual labels to numeric, since pROC requires a numeric or ordered predictor
tsdata_num <- ifelse(tsdata$label == "BLUE", 1, 0)

auc1 <- pROC::auc(pROC::roc(bagging_pred, tsdata_num))
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
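
As a sanity check on the hand-computed metrics (a small addition, treating BLUE as the positive class), caret can recompute accuracy, sensitivity (TPR) and specificity (TNR) directly from the same table:

caret::confusionMatrix(bagging_table, positive = "BLUE")   # should match accuracy1, tpr1, tnr1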

Jackknife: Leave-One-Out Cross-Validation (LOOCV)

N<-nrow(trdata)
N
## [1] 26
# recode the training labels as 0/1 so predictions and actuals can be compared numerically
trdata$label <- ifelse(trdata$label == 'BLUE', 1, 0)

# leave one row out at a time: fit naive Bayes on the remaining N-1 rows and
# predict the single held-out row
cvjdata  <- do.call('rbind', lapply(1:N, FUN=function(idx, data=trdata) {
  m <- naiveBayes(label~., data = data[-idx,])       # fit on all rows except idx
  p <- predict(m, data[idx,-c(3)], type='raw')       # posterior class probabilities for the held-out row
  pc <- unlist(apply(round(p), 1, which.max))-1      # convert the posterior to a 0/1 prediction
  list("fold"=idx, "m"=m, "predicted"=pc, "actual" = data[idx,c(3)])
  }
))
head(cvjdata)
##      fold m      predicted actual
## [1,] 1    List,5 0         0     
## [2,] 2    List,5 1         1     
## [3,] 3    List,5 1         0     
## [4,] 4    List,5 0         0     
## [5,] 5    List,5 0         1     
## [6,] 6    List,5 0         1
cvjdata <- as.data.frame(cvjdata)

# cross-tabulate the held-out actuals (rows) against the LOOCV predictions (columns)
loocv_table <- table(as.numeric(cvjdata$actual), as.numeric(cvjdata$predicted))

loocv_cfm <- caret::confusionMatrix(loocv_table)
loocv_cfm
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 14  2
##   1  4  6
##                                           
##                Accuracy : 0.7692          
##                  95% CI : (0.5635, 0.9103)
##     No Information Rate : 0.6923          
##     P-Value [Acc > NIR] : 0.2679          
##                                           
##                   Kappa : 0.4935          
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.7778          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.8750          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.6923          
##          Detection Rate : 0.5385          
##    Detection Prevalence : 0.6154          
##       Balanced Accuracy : 0.7639          
##                                           
##        'Positive' Class : 0               
## 
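
As a quick cross-check (a small addition using the fold-level results already stored in cvjdata), the LOOCV accuracy is simply the share of held-out rows predicted correctly:

mean(as.numeric(cvjdata$predicted) == as.numeric(cvjdata$actual))   # should equal 20/26 = 0.7692
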
# now apply each of the N LOOCV models to the test data and aggregate their predictions;
# since this is classification we take a majority vote over the models
# (one pass over the models, then one over the test rows)

tsdata$label <- ifelse(tsdata$label == 'BLUE', 1, 0)   # recode the test labels as 0/1 as well

cvj.perf <- as.data.frame(do.call('cbind', lapply(cvjdata$m, FUN=function(m, data=tsdata)
{
  v <- predict(m, data[,-c(3)], type='raw')               # posterior probabilities on the test set
  lbllist <- unlist(apply(round(v), 1, which.max)) - 1    # 0/1 vote from this fold's model
})))

np <- ncol(cvj.perf)   # number of fitted models casting votes

# majority vote per test row: predict 1 (BLUE) when at least half of the votes are 1
# (note: as written, the sum skips the first model's vote)
predclass <- unlist(apply(cvj.perf, 1, FUN=function(v){ ifelse(sum(v[2:length(v)])/np < 0.5, 0, 1) }))

loocv_table <-table(tsdata[,3],predclass)

(loocv_cfm2<-caret::confusionMatrix(loocv_table))
## Confusion Matrix and Statistics
## 
##    predclass
##     0 1
##   0 2 4
##   1 3 1
##                                           
##                Accuracy : 0.3             
##                  95% CI : (0.0667, 0.6525)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.9453          
##                                           
##                   Kappa : -0.4            
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.4000          
##             Specificity : 0.2000          
##          Pos Pred Value : 0.3333          
##          Neg Pred Value : 0.2500          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2000          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.3000          
##                                           
##        'Positive' Class : 0               
## 
auc2 <- pROC::auc(pROC::roc(predclass, tsdata_num))
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases

Comparing Models

Algorithm <- c("LR", "NB", "kNN", "Bagging", "LOOCV")
AUC <- c(0.56, 0.52, 0.75, auc1, auc2)
Accuracy <- c(0.4, 0.7, 0.64, accuracy1, 0.3)
TPR <- c(0.4, 0.625, 0.67, tpr1, 0.33)
FPR <- c(1-0.4, 1-1, 1-0.5, fpr1, 1-0.25)       # FPR = 1 - TNR
TNR <- c(0.4, 1, 0.5, tnr1, 0.25)
FNR <- c(1-0.4, 1-0.625, 1-0.67, fnr1, 1-0.33)  # FNR = 1 - TPR
df <- data.frame(Algorithm, AUC, Accuracy, TPR, FPR, TNR, FNR)
df
##   Algorithm  AUC Accuracy   TPR  FPR  TNR   FNR
## 1        LR 0.56     0.40 0.400 0.60 0.40 0.600
## 2        NB 0.52     0.70 0.625 0.00 1.00 0.375
## 3       kNN 0.75     0.64 0.670 0.50 0.50 0.330
## 4   Bagging 0.50     0.50 0.500 0.50 0.50 0.500
## 5     LOOCV 0.70     0.30 0.330 0.75 0.25 0.670

Interestingly, bagging came out at 0.50 across every metric in the table above, while LOOCV shows a high AUC alongside low accuracy, TPR and TNR. That high AUC is likely an artifact of how the ROC curve was built: pROC reported "Setting direction: controls > cases", meaning it flipped the comparison direction (it picks the direction from the group medians), so the 0.70 effectively describes the inverted LOOCV predictions; the short check below illustrates this. Although bagging outperformed LOOCV on the test set, its accuracy is still lower than both NB and kNN. Perhaps bagging and cross-validation do not necessarily beat simpler models, especially on a dataset this small (36 rows).
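
A minimal check of that interpretation (an assumption on my part, using pROC's explicit direction argument and the conventional response-then-predictor order):

# with the conventional direction (cases expected to score higher), the LOOCV AUC
# falls to roughly 0.29, in line with its low accuracy rather than the 0.70 above
pROC::auc(pROC::roc(tsdata_num, predclass, direction = "<"))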