Homework 2 – Run the model exercise

Consider a dataset as shown below:

df <- data.frame(
  X = as.factor(c(5, 5, 5, 5, 5, 5, 19, 19, 19, 19, 19, 19, 35, 35, 35, 35, 35, 35, 51, 51, 51, 51, 51, 51, 55, 55, 55, 55, 55, 55, 63, 63, 63, 63, 63, 63)),
  Y = c("a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f","a","b","c","d","e","f"),
  label = c("BLUE","BLACK","BLUE","BLACK","BLACK","BLACK","BLUE","BLUE","BLUE","BLUE","BLACK","BLUE","BLACK","BLACK","BLUE","BLACK","BLACK","BLACK","BLACK","BLACK","BLUE","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLACK","BLUE","BLUE","BLUE","BLUE","BLUE")
)
df
##     X Y label
## 1   5 a  BLUE
## 2   5 b BLACK
## 3   5 c  BLUE
## 4   5 d BLACK
## 5   5 e BLACK
## 6   5 f BLACK
## 7  19 a  BLUE
## 8  19 b  BLUE
## 9  19 c  BLUE
## 10 19 d  BLUE
## 11 19 e BLACK
## 12 19 f  BLUE
## 13 35 a BLACK
## 14 35 b BLACK
## 15 35 c  BLUE
## 16 35 d BLACK
## 17 35 e BLACK
## 18 35 f BLACK
## 19 51 a BLACK
## 20 51 b BLACK
## 21 51 c  BLUE
## 22 51 d BLACK
## 23 51 e BLACK
## 24 51 f BLACK
## 25 55 a BLACK
## 26 55 b BLACK
## 27 55 c BLACK
## 28 55 d BLACK
## 29 55 e BLACK
## 30 55 f BLACK
## 31 63 a BLACK
## 32 63 b  BLUE
## 33 63 c  BLUE
## 34 63 d  BLUE
## 35 63 e  BLUE
## 36 63 f  BLUE
str(df)
## 'data.frame':    36 obs. of  3 variables:
##  $ X    : Factor w/ 6 levels "5","19","35",..: 1 1 1 1 1 1 2 2 2 2 ...
##  $ Y    : Factor w/ 6 levels "a","b","c","d",..: 1 2 3 4 5 6 1 2 3 4 ...
##  $ label: Factor w/ 2 levels "BLACK","BLUE": 2 1 2 1 1 1 2 2 2 2 ...
summary(df)
##   X     Y       label   
##  5 :6   a:6   BLACK:22  
##  19:6   b:6   BLUE :14  
##  35:6   c:6             
##  51:6   d:6             
##  55:6   e:6             
##  63:6   f:6

Run kNN, Tree, NB, LDA, LR, and SVM with an RBF kernel, and

determine the AUC, accuracy, TPR, and FPR for each algorithm. Create a table as shown below:

ALGO  AUC  ACC  TPR  FPR
LR
LDA
NB
SVM
kNN
TREE

Summarize and provide an explanatory commentary on the observed performance of these classifiers.
What aspects of the data and/or of the algorithms explain these performance differences?
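The metric computation is repeated for every model below; one way to keep it uniform is a small helper. This is a sketch only, not part of the graded script: the name classMetrics is illustrative, it assumes the caret and ModelMetrics packages, and it treats BLACK (the first factor level) as the positive class, which is caret's default.

classMetrics <- function(pred, obs) {
  c(AUC = ModelMetrics::auc(actual = obs, predicted = pred),  # AUC from hard class labels, as in the script below
    ACC = mean(pred == obs),                                  # overall accuracy
    TPR = caret::sensitivity(pred, obs),                      # sensitivity for the positive class (BLACK)
    FPR = 1 - caret::specificity(pred, obs))                  # false positive rate = 1 - specificity
}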

library(caret)
library(ModelMetrics)

#set.seed(200)   # left commented out, so each run draws a different 70/30 split
respCol <- ncol(df)                                   # index of the response column (label)
train <- createDataPartition(df[, respCol], p = .70)  # 70/30 stratified split
obs <- df[-train$Resample1, respCol]                  # held-out (test) labels

perfALG <- c("LR", "LDA", "NB", "SVM", "KNN", "TREE")
perfAUC <- numeric()
perfACC <- numeric()
perfTPR <- numeric()
perfFPR <- numeric()
## Logistic Regression (LR) Model
lrFit <- glm(label~., data = df[train$Resample1, ], family = binomial)
#summary(lrFit)
lrProb <- predict(lrFit, newdata = df[-train$Resample1,], type = "response")
#contrasts(df[, respCol])
lrPred <- rep("BLACK", length(lrProb))
lrPred[lrProb > 0.5] <- "BLUE"
lrPred <- factor(lrPred, levels = levels(obs))  # keep both levels so the metrics below line up
#postResample(lrPred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = lrPred))
perfACC <- c(perfACC, postResample(lrPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(lrPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(lrPred, obs))
table(lrPred, obs)
##        obs
## lrPred  BLACK BLUE
##   BLACK     5    1
##   BLUE      1    3
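# Illustrative aside (not part of the original run): auc() above receives the hard
# class labels, which collapses the ROC curve to a single operating point. A
# ranking-based AUC can instead be computed from the fitted probabilities, here
# treating BLUE as the positive class (glm models P(label = "BLUE"), since BLACK
# is the reference level):
auc(actual = as.numeric(obs == "BLUE"), predicted = lrProb)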
## LDA Model
library(MASS)
ldaFit <- lda(label~., data = df[train$Resample1, ])
#ldaFit
ldaPred <- predict(ldaFit, newdata = df[-train$Resample1,])
#postResample(pred$class, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = ldaPred$class))
perfACC <- c(perfACC, postResample(ldaPred$class, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(ldaPred$class, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(ldaPred$class, obs))
table(ldaPred$class, obs)
##        obs
##         BLACK BLUE
##   BLACK     5    3
##   BLUE      1    1
## Naive Bayes (NB) Model
library(e1071)
nbFit <- naiveBayes(label ~ ., data = df[train$Resample1, ])
#print(nbFit)
nbPred <- predict(nbFit, newdata = df[-train$Resample1,], type = "class")
#postResample(nbPred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = nbPred))
perfACC <- c(perfACC, postResample(nbPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(nbPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(nbPred, obs))
table(nbPred, obs)
##        obs
## nbPred  BLACK BLUE
##   BLACK     5    2
##   BLUE      1    2
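# Illustrative aside (not part of the original run): with purely categorical
# predictors, naiveBayes() simply stores class-conditional frequency tables for
# each predictor, i.e. P(X level | label) and P(Y level | label):
nbFit$tables$X
nbFit$tables$Y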
## SVM Model (RBF kernel)
svmFit <- svm(label~., data = df[train$Resample1,], kernel = "radial")
svmPred <- predict(svmFit, newdata = df[-train$Resample1,], type = "class")
#postResample(svmPred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = svmPred))
perfACC <- c(perfACC, postResample(svmPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(svmPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(svmPred, obs))
table(svmPred, obs)
##        obs
## svmPred BLACK BLUE
##   BLACK     6    4
##   BLUE      0    0
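# Illustrative aside (not part of the original run): the RBF SVM above uses the
# default cost and gamma, and on this tiny training set it collapses to predicting
# the majority class (BLACK) for every test row. A small grid search with e1071's
# tune.svm() shows whether any (gamma, cost) pair separates the classes better;
# note that with roughly 26 training rows the internal 10-fold CV is itself noisy.
svmTune <- tune.svm(label ~ ., data = df[train$Resample1, ],
                    gamma = 2^(-4:2), cost = 2^(0:4))
summary(svmTune)
table(predict(svmTune$best.model, newdata = df[-train$Resample1, ]), obs)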
## KNN Model
knnFit <- knn3(label ~., data = df[train$Resample1,])
knnPred <- predict(knnFit, newdata = df[-train$Resample1,], type = "class")
#postResample(knnPred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = knnPred))
perfACC <- c(perfACC, postResample(knnPred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(knnPred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(knnPred, obs))
table(knnPred, obs)
##        obs
## knnPred BLACK BLUE
##   BLACK     5    2
##   BLUE      1    2
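# Illustrative aside (not part of the original run): knn3() above uses its default
# k = 5; with roughly 26 training rows the choice of k matters a lot. A quick
# sweep over a few values, reusing the same split, might look like this:
for (k in c(1, 3, 5, 7)) {
  fitK  <- knn3(label ~ ., data = df[train$Resample1, ], k = k)
  predK <- predict(fitK, newdata = df[-train$Resample1, ], type = "class")
  cat("k =", k, " accuracy =", round(mean(predK == obs), 3), "\n")
}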
## Tree Model
library(tree)
treeFit <- tree(label~., data = df[train$Resample1,])
#summary(treeFit)
treePred <- predict(treeFit, newdata = df[-train$Resample1,], type = "class")
#postResample(treePred, obs)
perfAUC <- c(perfAUC, auc(actual = obs, predicted = treePred))
perfACC <- c(perfACC, postResample(treePred, obs)["Accuracy"])
perfTPR <- c(perfTPR, caret::sensitivity(treePred, obs))
perfFPR <- c(perfFPR, 1 - caret::specificity(treePred, obs))
table(treePred, obs)
##         obs
## treePred BLACK BLUE
##    BLACK     4    1
##    BLUE      2    3

Summary

perf <- data.frame(
  ALGO = perfALG,
  AUC = perfAUC,
  ACC = perfACC,
  TPR = perfTPR,
  FPR = perfFPR
)
perf
##   ALGO       AUC ACC       TPR  FPR
## 1   LR 0.7916667 0.8 0.8333333 0.25
## 2  LDA 0.5416667 0.6 0.8333333 0.75
## 3   NB 0.6666667 0.7 0.8333333 0.50
## 4  SVM 0.5000000 0.6 1.0000000 1.00
## 5  KNN 0.6666667 0.7 0.8333333 0.50
## 6 TREE 0.7083333 0.7 0.6666667 0.25

It is hard to declare a clear winner among these classifiers: without a fixed seed, every run draws a different 70/30 training/testing split, and with only 36 rows (about 10 test observations) that split strongly influences both the fitted models and their measured performance. SVM, however, is clearly the worst performer here, with AUC = 0.5 and both TPR and FPR equal to 1, since it predicted BLACK for every test row. The amount of data (36 rows) and its structure are not ideal for any particular classifier: both predictors are discrete factors (X is coerced from numbers to a factor, so even its ordering is discarded), and every level of X is paired with every level of Y exactly once, as in a Cartesian join. A plausible reason SVM performs the worst so consistently is that this grid-like, purely categorical design offers no real margin for separating the data points, and such structure in the predictors tends to make the classifiers unpredictable or unstable from run to run.
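
To make the run-to-run variability concrete, one can repeat the 70/30 split many times and look at the spread of test accuracy for a single model. The sketch below (illustrative only, with throwaway names accs and idx) does this for the logistic model; glm may warn about perfect separation on some of these tiny samples. The cross-tabulation of the predictors also confirms the Cartesian structure noted above: every level of X is paired with every level of Y exactly once.

table(df$X, df$Y)   # full grid: each (X, Y) combination occurs exactly once

accs <- replicate(100, {
  idx  <- createDataPartition(df$label, p = .70)$Resample1
  fit  <- glm(label ~ ., data = df[idx, ], family = binomial)
  prob <- predict(fit, newdata = df[-idx, ], type = "response")
  pred <- ifelse(prob > 0.5, "BLUE", "BLACK")
  mean(pred == df$label[-idx])            # test accuracy for this split
})
summary(accs)   # the spread shows how strongly the split drives the measured accuracy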