(A) Run Bagging (ipred)

About ipred: Improved Predictors

The ipred package provides improved predictive models via indirect classification and bagging for classification, regression, and survival problems, as well as resampling-based estimators of prediction error. See the reference manual for details: https://cran.r-project.org/web/packages/ipred/ipred.pdf
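
The package's main entry point is bagging(). As a quick illustration of the interface, here is a minimal sketch on R's built-in iris data (not the homework data; demo_fit is just an illustrative name):

library(ipred)

# Fit 25 bagged classification trees; coob = TRUE requests an
# out-of-bag estimate of the misclassification error
demo_fit <- bagging(Species ~ ., data = iris, nbagg = 25, coob = TRUE)
demo_fit$err   # out-of-bag misclassification error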

Data Preparation

Here we prepare the data by loading the CSV file and examining the structure of the data set.

# Packages used in this section: caret (createDataPartition,
# confusionMatrix), ipred (bagging), e1071 (naiveBayes), pROC (roc, auc)
library(caret)
library(ipred)
library(e1071)
library(pROC)

data <- read.csv("data622hw1.csv", header = TRUE)

Examine a sample of the data, its structure, and a summary:

head(data)
##   X Y label
## 1 5 a  BLUE
## 2 5 b BLACK
## 3 5 c  BLUE
## 4 5 d BLACK
## 5 5 e BLACK
## 6 5 f BLACK
str(data)
## 'data.frame':    36 obs. of  3 variables:
##  $ X    : int  5 5 5 5 5 5 19 19 19 19 ...
##  $ Y    : Factor w/ 6 levels "a","b","c","d",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ label: Factor w/ 2 levels "BLACK","BLUE": 2 1 2 1 1 1 2 2 2 2 ...
summary(data)
##        X      Y       label   
##  Min.   : 5   a:6   BLACK:22  
##  1st Qu.:19   b:6   BLUE :14  
##  Median :43   c:6             
##  Mean   :38   d:6             
##  3rd Qu.:55   e:6             
##  Max.   :63   f:6

Bagging

Split the data into training and test sets: train on 70% of the observations, test on the remaining 30%. createDataPartition() stratifies the split on label, so both sets keep roughly the original class proportions.

set.seed(40)
traindata.index <- createDataPartition(data$label, p = 0.7, list = FALSE)
traindata <- data[traindata.index,]
testdata  <- data[-traindata.index,]

summary(traindata)
##        X         Y       label   
##  Min.   : 5.00   a:3   BLACK:16  
##  1st Qu.:19.00   b:4   BLUE :10  
##  Median :51.00   c:5             
##  Mean   :39.15   d:5             
##  3rd Qu.:55.00   e:5             
##  Max.   :63.00   f:4
summary(testdata)
##        X      Y       label  
##  Min.   : 5   a:3   BLACK:6  
##  1st Qu.:23   b:2   BLUE :4  
##  Median :35   c:1            
##  Mean   :35   d:1            
##  3rd Qu.:50   e:1            
##  Max.   :63   f:2
# Bagging: 100 bootstrap replications; coob = TRUE requests the
# out-of-bag (OOB) estimate of the misclassification error
bgdatamodel <- bagging(label ~ ., data = traindata, nbagg = 100, coob = TRUE)
bgdatamodel
## 
## Bagging classification trees with 100 bootstrap replications 
## 
## Call: bagging.data.frame(formula = label ~ ., data = traindata, nbagg = 100, 
##     coob = TRUE)
## 
## Out-of-bag estimate of misclassification error:  0.3462
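
The out-of-bag error is a built-in estimate of generalization error that needs no separate validation set. If desired, one can watch it stabilize as the number of bootstrap replications grows; a minimal sketch (the err component of the fitted object holds the OOB misclassification error when coob = TRUE):

# OOB error as a function of the number of bagged trees
oob_err <- sapply(c(10, 25, 50, 100, 200), function(b) {
  bagging(label ~ ., data = traindata, nbagg = b, coob = TRUE)$err
})
oob_err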
# Predict labels for the held-out 30% test set
predictdatat <- predict(bgdatamodel, testdata)

Model Metrics

(prddata <- with(testdata, table(predictdatat, label)))
##             label
## predictdatat BLACK BLUE
##        BLACK     4    0
##        BLUE      2    4
funcStats <- function(tn, fn, fp, tp, auc) {

  tpr <- tp / (tp + fn)                  # true positive rate (sensitivity)
  tnr <- tn / (tn + fp)                  # true negative rate (specificity)
  fnr <- 1 - tpr                         # false negative rate
  fpr <- 1 - tnr                         # false positive rate
  acc <- (tp + tn) / (tp + tn + fp + fn) # overall accuracy

  tblstats <- matrix(c(tpr, tnr, fnr, fpr, auc, acc), ncol = 1, byrow = TRUE)
  colnames(tblstats) <- c("Value")
  rownames(tblstats) <- c("TPR", "TNR", "FNR", "FPR", "AUC", "ACC")
  tblstats
}
# Rows of prddata are predicted labels; columns are actual labels.
# BLUE is the positive class.
tn <- prddata[1,1]  # predicted BLACK, actual BLACK
fn <- prddata[1,2]  # predicted BLACK, actual BLUE
fp <- prddata[2,1]  # predicted BLUE,  actual BLACK
tp <- prddata[2,2]  # predicted BLUE,  actual BLUE

# Actual labels recoded as numeric: 1 = BLUE, 0 = BLACK
dtlbl <- ifelse(testdata$label == 'BLUE', 1, 0)

# Area under the ROC curve (AUC) with the trapezoidal rule.
# pROC's roc() expects roc(response, predictor), so the actual labels
# come first and the numeric predictions second.
auc <- auc(roc(dtlbl, ifelse(predictdatat == 'BLUE', 1, 0)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
tblStats <- funcStats(tn, fn, fp, tp, auc)
tblStats
##         Value
## TPR 1.0000000
## TNR 0.6666667
## FNR 0.0000000
## FPR 0.3333333
## AUC 0.8333333
## ACC 0.8000000
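
As a sanity check: a hard (single-threshold) classifier produces an ROC curve with only three points, (0, 0), (FPR, TPR), and (1, 1), so the trapezoidal AUC reduces to (TPR + TNR) / 2. Verifying against the value above:

# (TPR + TNR) / 2 for a single-threshold classifier
(tp / (tp + fn) + tn / (tn + fp)) / 2   # 0.8333, matching the AUC above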

(B) Run LOOCV (jackknife)

About LOOCV

Leave-one-out cross-validation (LOOCV) holds out a single data point, trains on the remaining N - 1 points, and repeats this for every point in turn. LOOCV therefore requires N model fits (where N is the number of data points), which is costly for large N. The advantage is that the procedure is deterministic: it delivers exactly the same result every time, because every possible leave-one-out split is evaluated.
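
For reference, caret can run LOOCV directly via trainControl(method = "LOOCV"); a minimal sketch (method = "glm" here is purely illustrative and is not the naive Bayes model used below):

# LOOCV via caret: N model fits, one per held-out observation
loocv_ctrl <- trainControl(method = "LOOCV")
loocv_fit  <- train(label ~ ., data = data, method = "glm",
                    family = binomial, trControl = loocv_ctrl)
loocv_fit$results$Accuracy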

rowcount <- nrow(traindata)
print(rowcount)
## [1] 26
# Recode the training labels as numeric: 1 = BLUE, 0 = BLACK
traindata$label <- ifelse(traindata$label == 'BLUE', 1, 0)

# LOOCV loop: for each row idx, fit naive Bayes on all other rows and
# predict the held-out row. predict(type = 'raw') returns class
# probabilities; which.max on the rounded probabilities picks the
# predicted class, and subtracting 1 maps it back to the 0/1 coding.
cvjdata <- do.call('rbind', lapply(1:rowcount, FUN = function(idx, data = traindata) {
  m  <- naiveBayes(label ~ ., data = data[-idx, ])
  p  <- predict(m, data[idx, -c(3)], type = 'raw')
  pc <- unlist(apply(round(p), 1, which.max)) - 1
  list("fold" = idx, "m" = m, "predicted" = pc, "actual" = data[idx, c(3)])
}))

head(cvjdata)
##      fold m      predicted actual
## [1,] 1    List,5 0         1     
## [2,] 2    List,5 1         1     
## [3,] 3    List,5 1         0     
## [4,] 4    List,5 1         0     
## [5,] 5    List,5 0         1     
## [6,] 6    List,5 1         1
# Convert the list-matrix returned by rbind into a data frame
cvjdata <- as.data.frame(cvjdata)

loocv_tbl <- table(as.numeric(cvjdata$actual), as.numeric(cvjdata$predicted))

(loocv_caret_cfm <- caret::confusionMatrix(loocv_tbl))
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 11  5
##   1  6  4
##                                           
##                Accuracy : 0.5769          
##                  95% CI : (0.3692, 0.7665)
##     No Information Rate : 0.6538          
##     P-Value [Acc > NIR] : 0.8485          
##                                           
##                   Kappa : 0.0892          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.6471          
##             Specificity : 0.4444          
##          Pos Pred Value : 0.6875          
##          Neg Pred Value : 0.4000          
##              Prevalence : 0.6538          
##          Detection Rate : 0.4231          
##    Detection Prevalence : 0.6154          
##       Balanced Accuracy : 0.5458          
##                                           
##        'Positive' Class : 0               
## 
# Recode the test labels the same way: 1 = BLUE, 0 = BLACK
testdata$label <- ifelse(testdata$label == 'BLUE', 1, 0)

# Score each of the 26 LOOCV models on the full test set; each column
# of df.perf holds one model's 0/1 predictions for the test rows
df.perf <- as.data.frame(do.call('cbind', lapply(cvjdata$m, FUN = function(m, data = testdata) {
  v <- predict(m, data[, -c(3)], type = 'raw')
  unlist(apply(round(v), 1, which.max)) - 1
})))

### Aggregate: majority vote across all LOOCV models
np <- ncol(df.perf)

# A test row is classified 1 (BLUE) when at least half the models vote 1
predclass <- unlist(apply(df.perf, 1, FUN = function(v) { ifelse(sum(v) / np < 0.5, 0, 1) }))

loocvtbl <- table(testdata[,3], predclass)

(loocv_cfm <- caret::confusionMatrix(loocvtbl))
## Confusion Matrix and Statistics
## 
##    predclass
##     0 1
##   0 4 2
##   1 3 1
##                                           
##                Accuracy : 0.5             
##                  95% CI : (0.1871, 0.8129)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.9527          
##                                           
##                   Kappa : -0.087          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.5714          
##             Specificity : 0.3333          
##          Pos Pred Value : 0.6667          
##          Neg Pred Value : 0.2500          
##              Prevalence : 0.7000          
##          Detection Rate : 0.4000          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.4524          
##                                           
##        'Positive' Class : 0               
## 

Conclusion

The bagging model reached an accuracy of 0.80 on the test set, while the LOOCV-aggregated naive Bayes models reached 0.50, so the two approaches performed quite differently here, with bagging scoring better. Bagging is a technique for reducing overfitting: many models are trained on bootstrap resamples of the data and their predictions are averaged, and the averaged model is less susceptible to overfitting than any individual model. LOOCV, on the other hand, is primarily a resampling scheme for estimating out-of-sample accuracy rather than a way to build a stronger model.
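
To make the averaging idea concrete, here is a minimal hand-rolled sketch of what bagging does internally, using rpart trees and a majority vote (an illustration under those assumptions, not ipred's actual implementation):

library(rpart)

# Fit B trees on bootstrap resamples, then majority-vote their predictions
manual_bagging <- function(formula, data, newdata, B = 100) {
  votes <- sapply(1:B, function(b) {
    boot <- data[sample(nrow(data), replace = TRUE), ]   # bootstrap resample
    tree <- rpart(formula, data = boot, method = "class")
    as.character(predict(tree, newdata, type = "class"))
  })
  # One majority vote per row of newdata
  apply(votes, 1, function(v) names(which.max(table(v))))
}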