library(ipred)
library(e1071)
path <- "file:///Users/bobo/Documents/622/HW1/15001956_p1_dataset_hw1.csv"
data <- read.csv(path, header = TRUE, sep = ',', stringsAsFactors = FALSE)
head(data)
## X Y label
## 1 5 a BLUE
## 2 5 b BLACK
## 3 5 c BLUE
## 4 5 d BLACK
## 5 5 e BLACK
## 6 5 f BLACK
str(data)
## 'data.frame': 36 obs. of 3 variables:
## $ X : int 5 5 5 5 5 5 19 19 19 19 ...
## $ Y : chr "a" "b" "c" "d" ...
## $ label: chr "BLUE" "BLACK" "BLUE" "BLACK" ...
data$Y <- as.factor(data$Y)
data$label <- as.factor(data$label)
table(data$label)
##
## BLACK BLUE
## 22 14
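The classes are somewhat imbalanced (22 BLACK vs. 14 BLUE out of 36 rows), so a useful reference point is the accuracy of a trivial classifier that always predicts the majority class; a minimal sketch:
# Majority-class ("no information") baseline: always predict BLACK
max(table(data$label)) / nrow(data)   # 22/36, roughly 0.61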
Let us split our data into 70% training and 30% testing datasets.
set.seed(11)
# createDataPartition() returns the row indices of the training set
tridx <- caret::createDataPartition(data$label, p = 0.7, list = FALSE)
trdata <- data[tridx, ]
tsdata <- data[-tridx, ]
summary(trdata)
## X Y label
## Min. : 5.00 a:4 BLACK:16
## 1st Qu.:19.00 b:5 BLUE :10
## Median :51.00 c:5
## Mean :40.23 d:6
## 3rd Qu.:55.00 e:4
## Max. :63.00 f:2
Bagging Model
bagging_model <- bagging(label ~ .,
                         data = trdata,
                         nbagg = 100,
                         coob = TRUE)
bagging_model
##
## Bagging classification trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = label ~ ., data = trdata, nbagg = 100,
## coob = TRUE)
##
## Out-of-bag estimate of misclassification error: 0.3077
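The out-of-bag estimate comes for free with bagging: every bootstrap sample leaves out roughly a third of the training rows, and those held-out rows are used to score the corresponding trees. A sketch of how the OOB estimate could be tracked while varying the number of bootstrap replications (the nbagg grid below is illustrative only):
# Sketch: re-fit the bagged model with different numbers of bootstrap
# replications and report each out-of-bag error (stored in $err when coob = TRUE);
# the nbagg values are chosen purely for illustration
for (nb in c(25, 50, 100, 200)) {
  set.seed(11)
  m <- bagging(label ~ ., data = trdata, nbagg = nb, coob = TRUE)
  cat("nbagg =", nb, "OOB error =", round(m$err, 4), "\n")
}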
Prediction vs actual
bagging_pred <- predict(bagging_model, tsdata)
bagging_table <- table(bagging_pred, tsdata$label)
bagging_table
##
## bagging_pred BLACK BLUE
## BLACK 3 2
## BLUE 3 2
# rows of bagging_table are the predictions, columns are the actual labels;
# BLUE is treated as the positive class
tn1 <- bagging_table[1,1]   # predicted BLACK, actual BLACK
fn1 <- bagging_table[1,2]   # predicted BLACK, actual BLUE
fp1 <- bagging_table[2,1]   # predicted BLUE,  actual BLACK
tp1 <- bagging_table[2,2]   # predicted BLUE,  actual BLUE
accuracy1 <- (tp1+tn1)/(tp1+tn1+fp1+fn1)
tpr1 <- tp1/(tp1+fn1)
tnr1 <- tn1/(tn1+fp1)
fnr1 <- 1-tpr1
fpr1 <- 1-tnr1
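As a sanity check, the same quantities can be read off caret::confusionMatrix applied to the table above; this is a sketch that assumes BLUE is treated as the positive class, matching the manual calculation:
# Cross-check of the hand-computed metrics (BLUE as the positive class);
# confusionMatrix() expects predictions in the rows and actual labels in the
# columns, which is how bagging_table was built
bagging_cfm <- caret::confusionMatrix(bagging_table, positive = "BLUE")
bagging_cfm$overall["Accuracy"]     # should match accuracy1
bagging_cfm$byClass["Sensitivity"]  # should match tpr1
bagging_cfm$byClass["Specificity"]  # should match tnr1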
# change label to numeric since predictor must be numeric or ordered
tsdata_num <- ifelse(tsdata$label == "BLUE", 1, 0)
auc1 <- pROC::auc(pROC::roc(bagging_pred, tsdata_num))
## Setting levels: control = BLACK, case = BLUE
## Setting direction: controls < cases
N<-nrow(trdata)
N
## [1] 26
# recode the label as numeric (BLUE = 1, BLACK = 0) for the LOOCV loop below
trdata$label <- ifelse(trdata$label == 'BLUE', 1, 0)
# Leave-one-out cross validation: fit a Naive Bayes model on all rows except
# one, predict the held-out row, and keep the model, prediction and actual label
cvjdata <- do.call('rbind', lapply(1:N, FUN = function(idx, data = trdata) {
  m  <- naiveBayes(label ~ ., data = data[-idx, ])
  p  <- predict(m, data[idx, -c(3)], type = 'raw')
  pc <- unlist(apply(round(p), 1, which.max)) - 1
  list("fold" = idx, "m" = m, "predicted" = pc, "actual" = data[idx, c(3)])
}))
head(cvjdata)
## fold m predicted actual
## [1,] 1 List,5 0 0
## [2,] 2 List,5 1 1
## [3,] 3 List,5 1 0
## [4,] 4 List,5 0 0
## [5,] 5 List,5 0 1
## [6,] 6 List,5 0 1
cvjdata <- as.data.frame(cvjdata)
loocv_table <- table(as.numeric(cvjdata$actual), as.numeric(cvjdata$predicted))
loocv_cfm <- caret::confusionMatrix((loocv_table))
loocv_cfm
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 14 2
## 1 4 6
##
## Accuracy : 0.7692
## 95% CI : (0.5635, 0.9103)
## No Information Rate : 0.6923
## P-Value [Acc > NIR] : 0.2679
##
## Kappa : 0.4935
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.7778
## Specificity : 0.7500
## Pos Pred Value : 0.8750
## Neg Pred Value : 0.6000
## Prevalence : 0.6923
## Detection Rate : 0.5385
## Detection Prevalence : 0.6154
## Balanced Accuracy : 0.7639
##
## 'Positive' Class : 0
##
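The reported accuracy is simply the fraction of the 26 held-out rows classified correctly, (14 + 6) / 26, which is about 0.769. The same number can be recovered directly from the per-fold results, for example:
# LOOCV accuracy computed directly from the fold-level predictions
mean(unlist(cvjdata$predicted) == unlist(cvjdata$actual))   # (14 + 6) / 26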
# Apply each of the N training models to the test data and combine their
# predictions; since this is classification, we take the majority vote
# (outer lapply over the models, inner apply over the test rows)
tsdata$label <- ifelse(tsdata$label == 'BLUE', 1, 0)
cvj.perf <- as.data.frame(do.call('cbind', lapply(cvjdata$m, FUN = function(m, data = tsdata) {
  v <- predict(m, data[, -c(3)], type = 'raw')
  unlist(apply(round(v), 1, which.max)) - 1
})))
np <- ncol(cvj.perf)
predclass <- unlist(apply(cvj.perf, 1, FUN = function(v) {
  ifelse(sum(v[2:length(v)]) / np < 0.5, 0, 1)
}))
loocv_table <- table(tsdata[,3], predclass)
(loocv_cfm2 <- caret::confusionMatrix(loocv_table))
## Confusion Matrix and Statistics
##
## predclass
## 0 1
## 0 2 4
## 1 3 1
##
## Accuracy : 0.3
## 95% CI : (0.0667, 0.6525)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.9453
##
## Kappa : -0.4
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.4000
## Specificity : 0.2000
## Pos Pred Value : 0.3333
## Neg Pred Value : 0.2500
## Prevalence : 0.5000
## Detection Rate : 0.2000
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.3000
##
## 'Positive' Class : 0
##
auc2 <- pROC::auc(pROC::roc(predclass, tsdata_num))
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
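For reference, the majority vote can also be written more compactly with rowMeans. The sketch below averages the votes of all N models for each test row, so it will not necessarily reproduce predclass above, which sums the votes from the second model onward:
# Alternative sketch of the majority vote: average every model's 0/1 vote per
# test row and predict 1 when at least half of the models vote 1
predclass_alt <- as.integer(rowMeans(cvj.perf) >= 0.5)
table(tsdata[,3], predclass_alt)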
Algorithm <- c("LR", "NB", "kNN", "Bagging", "LOOCV")
AUC <- c(0.56, 0.52, 0.75, auc1, auc2)
Accuracy <- c(0.4, 0.7, 0.64, accuracy1, 0.3)
TPR <- c(0.4, 0.625, 0.67, tpr1, 0.33)
FPR <- c(1-0.4, 1-1, 1-0.5, fpr1, 1-0.25)
TNR <- c(0.4, 1, 0.5, tnr1, 0.25)
FNR <- c(1-0.4, 1-0.625, 1-0.67, fnr1, 1-0.33)
df <- data.frame(Algorithm, AUC, Accuracy, TPR, FPR, TNR, FNR)
df
## Algorithm AUC Accuracy TPR FPR TNR FNR
## 1 LR 0.56 0.40 0.400 0.60 0.40 0.600
## 2 NB 0.52 0.70 0.625 0.00 1.00 0.375
## 3 kNN 0.75 0.64 0.670 0.50 0.50 0.330
## 4 Bagging 0.50 0.50 0.500 0.50 0.50 0.500
## 5 LOOCV 0.70 0.30 0.330 0.75 0.25 0.670
Interestingly, Bagging came out at 0.50 on every metric in the table above, while LOOCV produced a high AUC but low accuracy, TPR and TNR. Although Bagging outperformed LOOCV on most metrics, its accuracy is still lower than that of both NB and kNN. Perhaps bagging and cross-validation do not necessarily beat simpler models, at least on a dataset this small.