Libraries

library(tidyverse)
library(ipred)
library(e1071)
library(caret)
library(knitr)

Load Data

df <- read.csv("Homework01.csv", header = T, sep = ",", stringsAsFactors=T)

Converting the X variable into a factor

df$X <- factor(df$X)

Preliminary EDA

str(df)
## 'data.frame':    36 obs. of  3 variables:
##  $ X    : Factor w/ 6 levels "5","19","35",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Y    : Factor w/ 6 levels "a","b","c","d",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ label: Factor w/ 2 levels "BLACK","BLUE": 2 1 2 1 1 1 2 2 2 2 ...
head(df)
##   X Y label
## 1 5 a  BLUE
## 2 5 b BLACK
## 3 5 c  BLUE
## 4 5 d BLACK
## 5 5 e BLACK
## 6 5 f BLACK
dim(df)
## [1] 36  3
table(df$label)
## 
## BLACK  BLUE 
##    22    14
sum(table(df$label))
## [1] 36

I am skipping further EDA since I already performed it on the same data for Homework 1.

Split Data

Let’s split the data into training and test sets in a 70/30 ratio.

set.seed(43)
# Sample 30% of the row indices (10 of 36) without replacement for the test set
tstidx <- sample(1:nrow(df), 0.30*nrow(df), replace = F)
trdata <- df[-tstidx, ]
tsdata <- df[tstidx, ]
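
With only 36 rows, a plain random split can leave the classes unevenly represented across the partitions. As an aside, a stratified split preserving the BLACK/BLUE proportions could be sketched with caret’s createDataPartition (this is not what is used above; tr_idx, trdata2, and tsdata2 are illustrative names):

# Sketch: stratified 70/30 split that preserves class proportions (alternative only)
set.seed(43)
tr_idx  <- caret::createDataPartition(df$label, p = 0.70, list = FALSE)
trdata2 <- df[tr_idx, ]
tsdata2 <- df[-tr_idx, ]
table(trdata2$label); table(tsdata2$label)   # class balance in each partition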

Bagging

# 100 bootstrap replicates; coob = TRUE requests the out-of-bag error estimate
bagging_model <- bagging(label ~ ., data = trdata, nbagg = 100, coob = TRUE)
bagging_model
## 
## Bagging classification trees with 100 bootstrap replications 
## 
## Call: bagging.data.frame(formula = label ~ ., data = trdata, nbagg = 100, 
##     coob = TRUE)
## 
## Out-of-bag estimate of misclassification error:  0.1923
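
The reported error comes from the out-of-bag rows: each bootstrap sample is drawn with replacement, so every tree never sees roughly a third of the training rows and is scored on exactly those rows. A small base-R simulation sketch (n and oob_frac are illustrative names) shows the left-out fraction:

# Fraction of training rows left out of one bootstrap sample;
# for n = 26 the expected value is (1 - 1/26)^26, roughly 0.36
n <- nrow(trdata)
oob_frac <- replicate(1000, length(setdiff(1:n, sample(1:n, n, replace = TRUE))) / n)
mean(oob_frac)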

The out-of-bag estimate of the misclassification error is 0.1923, which is fairly good. Let’s check the accuracy on the training data before moving on to testing.

bagging_predict_training <- predict(bagging_model, trdata)
bagging_cfm_training <- table(bagging_predict_training, trdata$label)
bagging_cfm_training
##                         
## bagging_predict_training BLACK BLUE
##                    BLACK    14    0
##                    BLUE      0   12
bagging_acc <- sum(diag(bagging_cfm_training))/sum(bagging_cfm_training)
bagging_acc
## [1] 1

Accuracy is 100%, but this is on data the model has already seen. By our working rule of thumb, a model with an accuracy of 70% or higher is performant and therefore not under-fitting, so we can surmise the model is capable of learning. It is certainly better than a random guess, which would yield about 50% accuracy.

Now let’s estimate the metrics on the test dataset:

bagging_predict <- predict(bagging_model, tsdata)
bagging_cfm <- table(bagging_predict, tsdata$label)
bagging_cfm
##                
## bagging_predict BLACK BLUE
##           BLACK     5    1
##           BLUE      3    1
bagging_acc <- sum(diag(bagging_cfm))/sum(bagging_cfm)
bagging_acc
## [1] 0.6
# Rates from the confusion matrix (rows = predicted, columns = actual; BLACK = positive)
tpr_bagging <- bagging_cfm[1,1]/sum(bagging_cfm[1,1], bagging_cfm[2,1])   # TP / (TP + FN)
fpr_bagging <- bagging_cfm[1,2]/sum(bagging_cfm[1,2], bagging_cfm[2,2])   # FP / (FP + TN)
tnr_bagging <- 1 - fpr_bagging
fnr_bagging <- 1 - tpr_bagging
library(pROC)
# ROC built from hard class labels has a single operating point, so the AUC is coarse
roc_bagging <- roc(tsdata$label, as.numeric(bagging_predict))
auc_bagging <- roc_bagging$auc
# Collect the bagging metrics into a one-row summary table
bagging <- c("Bagging", round(auc_bagging,2), round(bagging_acc,2), round(tpr_bagging,2),
             round(fpr_bagging,2), round(tnr_bagging,2), round(fnr_bagging,2))
table <- data.frame(matrix(ncol = 7, nrow = 0))   # 7 columns to match the names below
table <- rbind(table, bagging)
colnames(table) <- c("ALGO", "AUC", "ACCURACY", "TPR", "FPR", "TNR", "FNR")
kable(table)
ALGO      AUC    ACCURACY   TPR    FPR   TNR   FNR
Bagging   0.56   0.6        0.62   0.5   0.5   0.38
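
As a cross-check, the same rates can be read from caret::confusionMatrix; with BLACK as the positive class, its Sensitivity and Specificity correspond to the TPR and TNR above (a sketch; cm is an illustrative name):

# Sketch: caret reports the same rates directly (BLACK treated as the positive class)
cm <- caret::confusionMatrix(bagging_cfm, positive = "BLACK")
cm$byClass[c("Sensitivity", "Specificity")]   # TPR and TNR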

LOOCV (Jackknife)

I will use the same code provided in the learning module. In LOOCV, each of the N training observations is held out once: a naive Bayes model is fit on the remaining N − 1 rows and then used to predict the held-out row.

N <- nrow(trdata)
print(N)
## [1] 26
# Recode the label as numeric 0/1 (BLACK = 1) to match the module's LOOCV code
trdata$label <- ifelse(trdata$label == 'BLACK', 1, 0)

# For each index, fit naive Bayes on all rows but one and predict the held-out row
cv_df <- do.call('rbind', lapply(1:N, FUN = function(idx, data = trdata) {
  m  <- naiveBayes(label ~ ., data = data[-idx,])       # fit on N - 1 rows
  p  <- predict(m, data[idx, -c(3)], type = 'raw')      # class probabilities
  pc <- unlist(apply(round(p), 1, which.max)) - 1       # probabilities -> 0/1 label
  list("fold" = idx, "m" = m, "predicted" = pc, "actual" = data[idx, c(3)])
}))

head(cv_df)
##      fold m      predicted actual
## [1,] 1    List,5 1         0     
## [2,] 2    List,5 0         0     
## [3,] 3    List,5 0         1     
## [4,] 4    List,5 0         0     
## [5,] 5    List,5 0         0     
## [6,] 6    List,5 0         0

cv_df now holds, for each fold, the fitted model, the held-out observation’s actual label, and the predicted label for that observation. We can extract the metrics into a data.frame and aggregate them.
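
As a quick sanity check before tabulating, the LOOCV accuracy is simply the share of held-out rows predicted correctly (a sketch against the cv_df built above):

# Sketch: LOOCV accuracy = share of held-out observations predicted correctly
mean(unlist(cv_df[, "predicted"]) == unlist(cv_df[, "actual"]))
# this should agree with the 0.8077 accuracy caret reports below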

cv_df <- as.data.frame(cv_df)

# Tabulate held-out actual vs. predicted labels across all folds
loocv_tbl <- table(as.numeric(cv_df$actual), as.numeric(cv_df$predicted))

(loocv_caret_cfm <- caret::confusionMatrix(loocv_tbl))
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 10  2
##   1  3 11
##                                           
##                Accuracy : 0.8077          
##                  95% CI : (0.6065, 0.9345)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 0.001247        
##                                           
##                   Kappa : 0.6154          
##                                           
##  Mcnemar's Test P-Value : 1.000000        
##                                           
##             Sensitivity : 0.7692          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.8333          
##          Neg Pred Value : 0.7857          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3846          
##    Detection Prevalence : 0.4615          
##       Balanced Accuracy : 0.8077          
##                                           
##        'Positive' Class : 0               
## 

Test metrics

# Recode the test labels to numeric 0/1 (BLACK = 1) as well
tsdata$label <- ifelse(tsdata$label == 'BLACK', 1, 0)

cv_df <- data.frame(cv_df)
# Score each of the N LOOCV models on the test set; each column of df.perf
# holds one model's 0/1 predictions for the test observations
df.perf <- as.data.frame(do.call('cbind', lapply(cv_df$m, FUN = function(m, data = tsdata)
{
  ### Determine Test Metrics
  v <- predict(m, data[, -c(3)], type = 'raw')          # posterior probabilities
  unlist(apply(round(v), 1, which.max)) - 1             # probabilities -> 0/1 labels
}
)))

### Aggregate Test Metrics: majority vote across all np LOOCV models
# (the original indexed v[2:length(v)], silently dropping the first model's vote)
np <- ncol(df.perf)
predclass <- unlist(apply(df.perf, 1, FUN = function(v) { ifelse(sum(v)/np < 0.5, 0, 1) }))
loocvtbl <- table(tsdata[,3], predclass)
(loocv_cfm<-caret::confusionMatrix(loocvtbl))
## Confusion Matrix and Statistics
## 
##    predclass
##     0 1
##   0 1 1
##   1 3 5
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.2624, 0.8784)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.6331          
##                                           
##                   Kappa : 0.0909          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.2500          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.4000          
##          Detection Rate : 0.1000          
##    Detection Prevalence : 0.2000          
##       Balanced Accuracy : 0.5417          
##                                           
##        'Positive' Class : 0               
## 
print(paste('LOO-CV:',loocv_cfm$overall[1]))
## [1] "LOO-CV: 0.6"

Let’s also run a plain naive Bayes model as the baseline.

nb.model <- naiveBayes(label ~ ., data = trdata)
nb.tstpred <- predict(nb.model, tsdata[, -c(3)], type = 'raw')
nb.tstclass <- unlist(apply(round(nb.tstpred), 1, which.max)) - 1   # probabilities -> 0/1 labels
nb.tbl <- table(tsdata[[3]], nb.tstclass)
nb.cfm <- caret::confusionMatrix(nb.tbl)
nb.cfm
## Confusion Matrix and Statistics
## 
##    nb.tstclass
##     0 1
##   0 1 1
##   1 3 5
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.2624, 0.8784)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.6331          
##                                           
##                   Kappa : 0.0909          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.2500          
##             Specificity : 0.8333          
##          Pos Pred Value : 0.5000          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.4000          
##          Detection Rate : 0.1000          
##    Detection Prevalence : 0.2000          
##       Balanced Accuracy : 0.5417          
##                                           
##        'Positive' Class : 0               
## 
print(paste('Base NB',nb.cfm$overall[[1]]))
## [1] "Base NB 0.6"

Now let’s bring in the metrics from Homework 1.

Homework-1

Summary

  • Both bagging and LOOCV (Jackknife) produced the same accuracy (0.6) on the test data.
  • Neither bagging nor LOOCV (Jackknife) improved the accuracy relative to Homework 1.
  • LOOCV (Jackknife) and the base naive Bayes model produced the same P-value (0.6331), which indicates that LOOCV (Jackknife) brought no improvement over the base model. A small comparison table is sketched below.
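
For completeness, a sketch that collects the three test accuracies computed above into a single table (it reuses bagging_acc, loocv_cfm, and nb.cfm from earlier chunks; comparison is an illustrative name):

# Sketch: side-by-side test accuracies from the chunks above
comparison <- data.frame(
  ALGO     = c("Bagging", "LOOCV (Jackknife)", "Base Naive Bayes"),
  ACCURACY = c(round(bagging_acc, 2),
               round(loocv_cfm$overall[["Accuracy"]], 2),
               round(nb.cfm$overall[["Accuracy"]], 2))
)
kable(comparison)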