Data summary

# Packages used in this section (if not already loaded earlier in the report)
library(caret)     # createDataPartition, knn3, postResample, confusionMatrix, sensitivity, specificity
library(e1071)     # naiveBayes
library(class)     # knn
library(Metrics)   # auc(actual, predicted); assumed source of the auc() calls below
library(knitr)     # kable

df <- data.frame(
  X = as.factor(c(5, 5, 5, 5, 5, 5, 19, 19, 19, 19, 19, 19, 35, 35, 35, 35, 35, 35, 51, 51, 51, 51, 51, 51, 55, 55, 55, 55, 55, 55, 63, 63, 63, 63, 63, 63)),
  Y = as.factor(c("a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f")),
  label = as.factor(c("BLUE","BLACK","BLUE","BLACK","BLACK","BLACK","BLUE","BLUE","BLUE","BLUE","BLACK","BLUE","BLACK","BLACK","BLUE","BLACK","BLACK","BLACK","BLACK","BLACK","BLUE","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLUE","BLUE","BLUE","BLUE","BLUE"))
)


summary(df)
##   X     Y       label   
##  5 :6   a:6   BLACK:22  
##  19:6   b:6   BLUE :14  
##  35:6   c:6             
##  51:6   d:6             
##  55:6   e:6             
##  63:6   f:6

Base Modeling

In this section we apply three learning algorithms to the dataset: Logistic Regression, Naive Bayes, and KNN. We split the dataset into training and test sets following the training and evaluation framework provided in the caret library (a minimal caret sketch follows the list below). For each algorithm we produce: a training summary, a model summary, and a confusion matrix based on the model's predictions for the test data.

Logistic regression (glm with family = "binomial"): no parameter tuning

Naive Bayes (naiveBayes from e1071)

K-nearest neighbors (knn3 from caret and knn from the class package): tuning of the k parameter
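For reference, a minimal sketch of what that caret framework looks like in a single call (the trainControl settings and tuneGrid here are illustrative, not the exact calls used below):

# Illustrative only: 5-fold CV of a k-NN model through caret::train
ctrl <- trainControl(method = "cv", number = 5)
kfit <- train(label ~ ., data = df, method = "knn",
              tuneGrid = data.frame(k = c(3, 5)), trControl = ctrl)
kfit$results                                   # training summary per value of k
confusionMatrix(predict(kfit, df), df$label)   # confusion matrix from model predictions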

# Data Preparation and Training/Test datasets split

set.seed(200)
respCol <- ncol(df)                                   # the response (label) is the last column
train <- createDataPartition(df[, respCol], p = .70)  # 70/30 stratified split on the label
obs <- df[-train$Resample1, respCol]                  # held-out (test) labels

perfALG <- c("LR", "NB",  "KNN")
perfAUC = numeric()
perfACC = numeric()
perfTPR = numeric()
perfFPR = numeric()
## Logistic Regression (LR) Model
set.seed(203)

lrFit <- glm(label~., data = df[train$Resample1, ], family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(lrFit)
lrProb <- predict(lrFit, newdata = df[-train$Resample1,], type = "response")
#contrasts(df[, respCol])
lrPred <- rep("BLACK", length(lrProb))
lrPred[lrProb > 0.5] = "BLUE"
lrPred = as.factor(lrPred)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = lrPred))
perfACC <- c(perfACC, postResample(lrPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(lrPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(lrPred, obs))
table(lrPred, obs)
##        obs
## lrPred  BLACK BLUE
##   BLACK     4    0
##   BLUE      2    4
cm_LOGR <- table(lrPred, obs)

auc_LOGR <- auc(actual = obs, predicted = lrPred)
acc_LOGR <- sum(diag(cm_LOGR)) / sum(cm_LOGR)
tpr_LOGR <- cm_LOGR[1,1]/sum(cm_LOGR[1,1], cm_LOGR[2,1])
fpr_LOGR <- cm_LOGR[1,2]/sum(cm_LOGR[1,2], cm_LOGR[2,2])
tnr_LR <- cm_LOGR[2,2]/sum(cm_LOGR[2,2], cm_LOGR[1,2])
fnr_LR <- cm_LOGR[2,1]/sum(cm_LOGR[2,1], cm_LOGR[1,1])


LRrow <- c("LOGR",round(auc_LOGR,2),round(acc_LOGR,2),round(tpr_LOGR,2),round(fpr_LOGR,2), round(tnr_LR,2), round(fnr_LR,2))
## Naive Bayes (NB) Model
set.seed(201)

nbFit <- naiveBayes(label ~ ., data = df[train$Resample1, ])
#print(nbFit)
nbPred <- predict(nbFit, newdata = df[-train$Resample1,], type = "class")
#postResample(nbPred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = nbPred))
perfACC <- c(perfACC, postResample(nbPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(nbPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(nbPred, obs))


cm_NAIVEB <-  table(nbPred, obs)
auc_NAIVEB <- auc(actual = obs, predicted = nbPred)
acc_NAIVEB <- sum(diag(cm_NAIVEB))/sum(cm_NAIVEB)
tpr_NAIVEB <- cm_NAIVEB[1,1]/sum(cm_NAIVEB[1,1], cm_NAIVEB[2,1])
fpr_NAIVEB <- cm_NAIVEB[1,2]/sum(cm_NAIVEB[1,2], cm_NAIVEB[2,2])
#tnr_NAIVEB<- 1 - fpr_NAIVEB
tnr_NAIVEB <- cm_NAIVEB[2,2]/sum(cm_NAIVEB[2,2], cm_NAIVEB[1,2])
fnr_NAIVEB <- cm_NAIVEB[2,1]/sum(cm_NAIVEB[2,1], cm_NAIVEB[1,1])

NAIVEBrow <- c("NAIVEBAYES",round(auc_NAIVEB,2),round(acc_NAIVEB,2),round(tpr_NAIVEB,2),round(fpr_NAIVEB,2), round(tnr_NAIVEB,2), round(fnr_NAIVEB,2))
# KNN 3

set.seed(300)
knnFit <- knn3(label ~., data = df[train$Resample1,], k=3)
knnPred <- predict(knnFit, newdata = df[-train$Resample1,], type = "class")
#postResample(knnPred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = knnPred))
perfACC <- c(perfACC, postResample(knnPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(knnPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(knnPred, obs))
table(knnPred, obs)
##        obs
## knnPred BLACK BLUE
##   BLACK     5    0
##   BLUE      1    4
cm_KNN <-  table(knnPred, obs)

auc_knn <- auc(actual = obs, predicted = knnPred)
acc_KNN <- sum(diag(cm_KNN))/sum(cm_KNN)
tpr_KNN <- cm_KNN[1,1]/sum(cm_KNN[1,1], cm_KNN[2,1])
fpr_KNN <- cm_KNN[1,2]/sum(cm_KNN[1,2], cm_KNN[2,2])
#tnr_KNN <- 1 - fpr_KNN
tnr_KNN <- cm_KNN[2,2]/sum(cm_KNN[2,2], cm_KNN[1,2])
fnr_KNN <- cm_KNN[2,1]/sum(cm_KNN[2,1], cm_KNN[1,1])

KNNrow <- c("KNN 3",round(auc_knn,2),round(acc_KNN,2),round(tpr_KNN,2),round(fpr_KNN,2), round(tnr_KNN,2),round(fnr_KNN,2) )


# KNN 5
set.seed(290)
data_train <-df[train$Resample1,]
data_train_knn<-df[train$Resample1,]
data_train_knn$Y<-apply(as.data.frame(data_train_knn$Y),1,utf8ToInt)
data_test_knn<-df[-train$Resample1,]
data_test_knn$Y<-apply(as.data.frame(data_test_knn$Y),1,utf8ToInt)

knn_model <- class::knn(cl = data_train$label,
                  test = data_test_knn[,1:2],
                  k = 5,
                  train = data_train_knn[,1:2],
                  prob = TRUE)

(confusion_matrix_KNN<-table(knn_model,data_test_knn[,3]))
##          
## knn_model BLACK BLUE
##     BLACK     5    1
##     BLUE      1    3
# Calculating  the ACC,TPR,FPR,TNR & FNR from confusion matrix
acc_KNN5 <- sum(diag(confusion_matrix_KNN)) / sum(confusion_matrix_KNN)

tpr_KNN5 <- confusion_matrix_KNN[1,1]/sum(confusion_matrix_KNN[1,1], confusion_matrix_KNN[2,1])
fpr_KNN5 <- confusion_matrix_KNN[1,2]/sum(confusion_matrix_KNN[1,2], confusion_matrix_KNN[2,2])
tnr_KNN5 <- confusion_matrix_KNN[2,2]/sum(confusion_matrix_KNN[2,2], confusion_matrix_KNN[1,2])
fnr_KNN5 <- confusion_matrix_KNN[2,1]/sum(confusion_matrix_KNN[2,1], confusion_matrix_KNN[1,1])

# Note: auc_knn from the k = 3 model is reused below; no separate AUC was computed for k = 5
KNNrow5 <- c("KNN 5",round(auc_knn,2),round(acc_KNN5,2),round(tpr_KNN5,2),round(fpr_KNN5,2), round(tnr_KNN5,2),round(fnr_KNN5,2))

Performance metrics calculated using caret:

perf <- data.frame(
  ALGO = perfALG,
  AUC = perfAUC,
  ACC = perfACC,
  TPR = perfTPR,
  FPR = perfFPR
)
perf
##   ALGO       AUC ACC       TPR FPR
## 1   LR 0.8333333 0.8 0.6666667 0.0
## 2   NB 0.6666667 0.7 0.8333333 0.5
## 3  KNN 0.9166667 0.9 0.8333333 0.0

(B) Run LOOCV (jackknife) for the same dataset

– iterate over all points

– keep one observation as test

– train using the rest of the observations

– determine test metrics

– aggregate the test metrics

end of loop

find the average of the test metric(s)

Compare (A), (B) above with the results you obtained in HW-1 and write 3 sentences explaining the observed difference.
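As a cross-check, the same leave-one-out procedure is also available through caret's built-in resampling (a sketch only, using the glm model so no extra packages are needed; the hand-rolled naiveBayes loop used for the assignment follows below):

# Sketch: LOOCV through caret's resampling machinery (illustrative)
loocv_ctrl <- trainControl(method = "LOOCV")
loocv_fit  <- train(label ~ ., data = df, method = "glm",
                    family = binomial, trControl = loocv_ctrl)
loocv_fit$results   # accuracy and kappa averaged over all 36 leave-one-out folds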

set.seed(45)
train.index <- createDataPartition(df$label, p = 0.7, list = FALSE)
train <- df[ train.index,]
test  <- df[-train.index,]

N<-nrow(train)
train$label <- ifelse(train$label == 'BLUE', 1, 0)
cv_df  <- do.call('rbind',lapply(1:N,FUN=function(idx,data=train) { ### Iterate Over All Points
   ### Keep one observation (row idx) as test; train using the rest of the observations
   m <- naiveBayes(label~., data = data[-idx,])
   ### Predict that one held-out observation
   p <- predict(m, data[idx,-c(3)], type='raw') 
   # NB returns the class probabilities; as per the Bayesian classifier,
   # we take the class with the higher probability (0 = BLACK, 1 = BLUE)
   pc <- unlist(apply(round(p), 1, which.max))-1 
   list("fold"=idx, "m"=m, "predicted"=pc, "actual" = data[idx,c(3)])
  }
))

cv_df<-as.data.frame(cv_df)
loocv_tbl<-table(as.numeric(cv_df$actual),as.numeric(cv_df$predicted))
(loocv_caret_cfm<-caret::confusionMatrix(loocv_tbl))
## Confusion Matrix and Statistics
## 
##    
##      0  1
##   0 13  3
##   1  5  5
##                                           
##                Accuracy : 0.6923          
##                  95% CI : (0.4821, 0.8567)
##     No Information Rate : 0.6923          
##     P-Value [Acc > NIR] : 0.5941          
##                                           
##                   Kappa : 0.3247          
##                                           
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.7222          
##             Specificity : 0.6250          
##          Pos Pred Value : 0.8125          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.6923          
##          Detection Rate : 0.5000          
##    Detection Prevalence : 0.6154          
##       Balanced Accuracy : 0.6736          
##                                           
##        'Positive' Class : 0               
## 
test$label <- ifelse(test$label == 'BLUE', 1, 0)
cv_df <- data.frame(cv_df)
df.perf<-as.data.frame(do.call('cbind',lapply(cv_df$m,FUN=function(m,data=test)
{
   ### Determine Test Metrics
  v <- predict(m,data[,-c(3)],type='raw')
  lbllist <- unlist(apply(round(v), 1, which.max))-1
 
}
  )))

### Aggregate Test Metrics
np <- ncol(df.perf)
predclass <- unlist(apply(df.perf,1,FUN=function(v){ ifelse(sum(v[2:length(v)])/np<0.5,0,1)}))
loocvtbl <- table(test[,3], predclass)
(loocv_cfm<-caret::confusionMatrix(loocvtbl))
## Confusion Matrix and Statistics
## 
##    predclass
##     0 1
##   0 4 2
##   1 2 2
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.2624, 0.8784)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.6331          
##                                           
##                   Kappa : 0.1667          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.6667          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.6000          
##          Detection Rate : 0.4000          
##    Detection Prevalence : 0.6000          
##       Balanced Accuracy : 0.5833          
##                                           
##        'Positive' Class : 0               
## 



# Calculating  the ACC,TPR,FPR,TNR & FNR from confusion matrix
auc_NBC <- auc(actual = test$label, predicted = predclass)  # test$label (0/1) matches predclass; obs belongs to the earlier split
acc_NBC <- sum(diag(loocvtbl)) / sum(loocvtbl)
tpr_NBC <- loocvtbl[1,1]/sum(loocvtbl[1,1], loocvtbl[2,1])
fpr_NBC <- loocvtbl[1,2]/sum(loocvtbl[1,2], loocvtbl[2,2])
tnr_NBC <- loocvtbl[2,2]/sum(loocvtbl[2,2], loocvtbl[1,2])
fnr_NBC <- loocvtbl[2,1]/sum(loocvtbl[2,1], loocvtbl[1,1])

# Putting all the values for NB with LOOCV into a row.
NB_LOOCVrow <- c("NB with LOOCV ",round(auc_NBC,2), round(acc_NBC,2),round(tpr_NBC,2),round(fpr_NBC,2), round(tnr_NBC,2),round(fnr_NBC,2))

(A) Run Bagging (ipred package)

– sample with replacement

– estimate metrics for a model

– repeat as many times as specified and report the average
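A minimal sketch of how this could be run with the ipred package (the nbagg value and the reuse of the train.index split are illustrative assumptions, not the run behind the 0.79 accuracy quoted in the conclusion):

library(ipred)
set.seed(400)
# Fit 25 bootstrap-aggregated classification trees; coob = TRUE requests the out-of-bag error
bagFit  <- bagging(label ~ ., data = df[train.index, ], nbagg = 25, coob = TRUE)
bagFit$err                                          # out-of-bag misclassification error estimate
bagPred <- predict(bagFit, newdata = df[-train.index, ])
caret::confusionMatrix(table(bagPred, df[-train.index, "label"]))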

It is difficult to declare a clear winner among the classifiers, since multiple runs select different training and test data, and this influences the training and the measured performance of each model on each run. From the performance readings below we can see that Logistic Regression performed well, with good AUC, TPR and FPR.

results <- data.frame(matrix(ncol = 7, nrow = 0))
results <- rbind(results, LRrow, NAIVEBrow, KNNrow, KNNrow5, NB_LOOCVrow)
colnames(results) <- c("ALGO", "AUC", "ACC", "TPR", "FPR", "TNR", "FNR")
#results
kable(results[order(-as.numeric(results$ACC)),])
    ALGO            AUC    ACC   TPR    FPR    TNR    FNR
3   KNN 3           0.92   0.9   0.83   0      1      0.17
1   LOGR            0.83   0.8   0.67   0      1      0.33
4   KNN 5           0.92   0.8   0.83   0.25   0.75   0.17
2   NAIVEBAYES      0.67   0.7   0.83   0.5    0.5    0.17
5   NB with LOOCV   0.38   0.6   0.67   0.5    0.5    0.33

Logistic Regression - The GLM logistic regression model shows high accuracy, AUC and TPR, which tells us the model did an above-average job at classification. As suggested by the scatter plot of the data, logistic regression is expected to produce linear separation boundaries in the predictor X-Y space.

Naive Bayes - The NB model has the weakest performance statistics here, which categorizes it as an average model. Generally NB performs reasonably well on the training set; we should also test performance over resampled test sets. For this model, tuning should be performed through cross-validation to determine the predictor distribution assumption, which can be a kernel density estimate rather than the normal distribution (see the sketch below).
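A sketch of that kind of tuning, assuming caret's "nb" method (backed by the klaR package; fL, usekernel and adjust are that method's tuning parameters, and the grid values here are illustrative):

# Illustrative: cross-validate the NB distribution assumption
# usekernel = TRUE replaces the normal density with a kernel density estimate
nb_grid <- expand.grid(fL = 0, usekernel = c(FALSE, TRUE), adjust = 1)
nb_cv   <- train(label ~ ., data = df, method = "nb", tuneGrid = nb_grid,
                 trControl = trainControl(method = "cv", number = 5))
nb_cv$bestTune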

KNN (3, 5) - The KNN models are competitive with logistic regression: KNN with k = 5 matches the LR accuracy of 0.8 while k = 3 reaches 0.9, and both have a higher AUC (0.92 vs. 0.83 for LR), with TPR among the highest. They appear to perform in line with the linear classifier (logistic regression). Two values of k (3 and 5) were tried here; perhaps with a larger dataset the performance of the kNN classifier would be more clearly distinguishable from the linear classifiers.

Conclusion

The bagging method produced an accuracy of 0.79, while the NB-with-LOOCV model reached an accuracy of 0.6 (AUC 0.38 in the table above); the two approaches perform differently, with bagging scoring better. Bagging is a method to reduce overfitting: many models are trained on resampled data and their predictions are averaged, and the averaged model is less susceptible to overfitting than the individual models. LOOCV, on the other hand, is a cross-validation scheme used to estimate out-of-sample accuracy rather than to build a better model. A minimal hand-rolled version of the bagging idea is sketched below.
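To make the averaging concrete, here is a small hand-rolled version of the idea (the number of resamples B, the naiveBayes learner, and the reuse of the train.index split are illustrative assumptions):

set.seed(500)
B <- 25
bag_train <- df[ train.index, ]   # second split; df still holds the factor labels
bag_test  <- df[-train.index, ]
# Fit B naive Bayes models, each on a bootstrap resample of the training rows
boot_preds <- sapply(1:B, function(b) {
  idx <- sample(nrow(bag_train), replace = TRUE)        # sample with replacement
  fit <- naiveBayes(label ~ ., data = bag_train[idx, ])
  as.character(predict(fit, bag_test, type = "class"))
})
# Majority vote across the B models gives the bagged prediction for each test row
bag_pred <- apply(boot_preds, 1, function(v) names(which.max(table(v))))
mean(bag_pred == as.character(bag_test$label))          # bagged test accuracy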