# Modeling packages
library(caret) # for general model fitting
library(rpart) # for fitting decision trees
library(ipred) # for fitting bagged decision trees
library(bootstrap) # for bootstrap resampling utilities
library(e1071) # for the naiveBayes classifier
library(tidyverse) # for data manipulation
library(cvAUC) # for cross-validated AUC estimates
library(pROC) # for ROC curves and AUC
The dataset being used has two independent variables and one categorical dependent variable. The dependent variable has two classes: BLACK and BLUE.
fl <- "https://raw.githubusercontent.com/mkivenson/Machine-Learning-Big-Data/master/HW1_Data.csv"
df <- read.csv(fl)
head(df)
## X Y label
## 1 5 a BLUE
## 2 5 b BLACK
## 3 5 c BLUE
## 4 5 d BLACK
## 5 5 e BLACK
## 6 5 f BLACK
This is a basic dataset, but some exploration is still helpful. A count of the label distribution shows that there are more BLACK labels than BLUE ones, though the classes aren't extremely imbalanced. When creating the train-test split, stratified sampling can be used to preserve this class balance in both splits.
df %>%
count(label)
## # A tibble: 2 x 2
## label n
## <fct> <int>
## 1 BLACK 22
## 2 BLUE 14
Here we do a 70-30 train-test split, stratified on the label.
set.seed(45)
train.index <- createDataPartition(df$label, p = 0.7, list = FALSE)
train <- df[ train.index,]
test <- df[-train.index,]
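Because createDataPartition samples within each class, the BLACK/BLUE proportions in the two splits should roughly mirror the full data. A quick sanity check (a sketch; the exact proportions depend on the seed and the rounding of the 70-30 split):
# Check that the stratified split preserved the class balance
prop.table(table(df$label))    # full data
prop.table(table(train$label)) # training split
prop.table(table(test$label))  # test split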
According to the [ipred library documentation](https://cran.r-project.org/web/packages/ipred/ipred.pdf) for the bagging function, the usual n-out-of-n bootstrap with replacement is performed by default.
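To make the default concrete, each of the nbagg trees is grown on a resample of the training rows drawn with replacement and of the same size as the training set; roughly a third of the rows are left out of each resample and form the out-of-bag set. A minimal sketch of one such replicate (illustrative only; the bagging function handles this internally):
# One n-out-of-n bootstrap replicate, as bagging() draws internally
n <- nrow(train)
boot.idx <- sample(n, size = n, replace = TRUE) # some rows repeat, roughly 37% are never drawn
boot.sample <- train[boot.idx, ]                # the data one tree would be grown on
oob.rows <- setdiff(seq_len(n), boot.idx)       # rows left out, used for the OOB error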
set.seed(42)
bagging_model <- bagging(label ~ .,
  data = train,
  nbagg = 100, # number of bootstrap replications in the bagged model
  coob = TRUE  # compute the out-of-bag error estimate
)
bagging_model
##
## Bagging classification trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = label ~ ., data = train, nbagg = 100,
## coob = TRUE)
##
## Out-of-bag estimate of misclassification error: 0.2308
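The out-of-bag error is computed on the rows each tree never saw, so it acts as a built-in validation estimate. Its stability can be checked as the number of replications grows; the sketch below assumes the fitted classbagg object stores the OOB estimate in its err component, which is what the printed summary above reports.
# Sketch: OOB error for increasing numbers of bagged trees (assumes the err component holds the OOB estimate)
oob.err <- sapply(c(10, 25, 50, 100, 200), function(b) {
  set.seed(42)
  bagging(label ~ ., data = train, nbagg = b, coob = TRUE)$err
})
oob.err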
bg.ypred <- predict(bagging_model, test) # predict labels for the held-out test set
cm_bg <- as.matrix.data.frame(table(bg.ypred, test$label)) # rows = predicted, columns = actual
rownames(cm_bg) <- c('predicted negative', 'predicted positive') # BLACK is the negative class
colnames(cm_bg) <- c('actual negative', 'actual positive') # BLUE is the positive class
cm_bg
##                    actual negative actual positive
## predicted negative               5               4
## predicted positive               1               0
TN <- cm_bg[1,1] # true negatives
FN <- cm_bg[1,2] # false negatives
FP <- cm_bg[2,1] # false positives
TP <- cm_bg[2,2] # true positives
y_act <- ifelse(test$label == 'BLUE', 1, 0) # encode BLUE as the positive class
AUC_BG <- auc(roc(bg.ypred, y_act))
TPR_BG <- TP / (TP + FN) # sensitivity / recall
TNR_BG <- TN / (TN + FP) # specificity
FNR_BG <- 1 - TPR_BG # miss rate
FPR_BG <- 1 - TNR_BG # fall-out
ACC_BG <- (TP + TN) / (TP + TN + FP + FN) # overall accuracy
print(c(AUC_BG, ACC_BG, TPR_BG, FPR_BG, TNR_BG, FNR_BG))
## [1] 0.2777778 0.5000000 0.0000000 0.1666667 0.8333333 1.0000000
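Since caret is already loaded, the same test-set metrics can be cross-checked with its confusionMatrix() helper, treating BLUE as the positive class; this should reproduce the accuracy, sensitivity (TPR), and specificity (TNR) computed above. This is only a sanity-check sketch, not part of the original analysis.
# Cross-check the hand-computed metrics, with BLUE as the positive class
confusionMatrix(data = bg.ypred, reference = test$label, positive = "BLUE")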
The LOOCV code below is adapted from the course learning module.
N <- nrow(train)
train$label <- ifelse(train$label == 'BLUE', 1, 0) # encode BLUE as 1, BLACK as 0
cv_df <- do.call('rbind', lapply(1:N, FUN = function(idx, data = train) { ### iterate over all observations
  ### Train on every observation except the held-out one
  m <- naiveBayes(label ~ ., data = data[-idx, ])
  ### Predict the single held-out observation
  p <- predict(m, data[idx, -c(3)], type = 'raw')
  # naiveBayes returns the class probabilities;
  # as per the Bayes classifier, we take the class with the higher probability
  pc <- unlist(apply(round(p), 1, which.max)) - 1
  list("fold" = idx, "m" = m, "predicted" = pc, "actual" = data[idx, c(3)])
}))
cv_df<-as.data.frame(cv_df)
loocv_tbl<-table(as.numeric(cv_df$actual),as.numeric(cv_df$predicted))
(loocv_caret_cfm<-caret::confusionMatrix(loocv_tbl))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 13 3
## 1 5 5
##
## Accuracy : 0.6923
## 95% CI : (0.4821, 0.8567)
## No Information Rate : 0.6923
## P-Value [Acc > NIR] : 0.5941
##
## Kappa : 0.3247
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.7222
## Specificity : 0.6250
## Pos Pred Value : 0.8125
## Neg Pred Value : 0.5000
## Prevalence : 0.6923
## Detection Rate : 0.5000
## Detection Prevalence : 0.6154
## Balanced Accuracy : 0.6736
##
## 'Positive' Class : 0
##
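The LOOCV accuracy can also be recovered directly from the per-fold predictions collected in cv_df, which should match the 0.6923 reported above (a quick sanity check):
# LOOCV accuracy straight from the per-fold results
mean(as.numeric(cv_df$predicted) == as.numeric(cv_df$actual))
Next, each of the fold models is applied to the held-out test set and their predictions are combined by majority vote.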
test$label <- ifelse(test$label == 'BLUE', 1, 0) # encode the test labels the same way
cv_df <- data.frame(cv_df)
df.perf <- as.data.frame(do.call('cbind', lapply(cv_df$m, FUN = function(m, data = test)
{
  ### Predict the test set with each fold's model
  v <- predict(m, data[, -c(3)], type = 'raw')
  lbllist <- unlist(apply(round(v), 1, which.max)) - 1
}
)))
### Aggregate the test predictions by majority vote across the fold models
np <- ncol(df.perf)
predclass <- unlist(apply(df.perf, 1, FUN = function(v) { ifelse(sum(v[2:length(v)]) / np < 0.5, 0, 1) }))
loocvtbl <- table(test[, 3], predclass)
(loocv_cfm <- caret::confusionMatrix(loocvtbl))
## Confusion Matrix and Statistics
##
## predclass
## 0 1
## 0 4 2
## 1 4 0
##
## Accuracy : 0.4
## 95% CI : (0.1216, 0.7376)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 0.9991
##
## Kappa : -0.3636
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.5000
## Specificity : 0.0000
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.0000
## Prevalence : 0.8000
## Detection Rate : 0.4000
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.2500
##
## 'Positive' Class : 0
##
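The cvAUC package loaded at the top is not otherwise used; one way to put it to work is to compute the AUC of the majority-vote test predictions, as sketched below. With hard 0/1 votes rather than probabilities, this AUC reduces to the balanced accuracy of the vote, so it mainly serves as another point of comparison against AUC_BG.
# Sketch: AUC of the majority-vote predictions on the held-out test set
cvAUC::AUC(predictions = predclass, labels = test$label)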
Neither bagging nor LOOCV produced good test accuracy: 0.5 and 0.4, respectively. Compared to previous models, bagging performed better than logistic regression and Naive Bayes, but significantly worse than KNN. LOOCV also had poor test accuracy, likely due to potential overfitting, but it was still better than Naive Bayes without cross-validation, which had an accuracy of only 0.3. Although bagging and LOOCV improved on the weak learners, the models remain sub-par; bagging or boosting a model with poor base performance does not always yield better results.