library(mlbench)
library(AppliedPredictiveModeling)
library(caret)
library(MASS)
library(nnet)
library(kernlab)
library(mda)
library(klaR)
library(pamr)
library(glmnet)
library(randomForest)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(patchwork)
library(partykit)
library(Cubist)
library(gbm)
library(earth)
library(modeldata)
library(corrplot)
library(pROC)
library(rpart)
library(rpart.plot)
In Homework 1, Problem 3, we described a data set which contained 96 oil samples each from one of seven types of oils (pumpkin, sunflower, peanut, olive, soybean,grapeseed, and corn). Gas chromatography was performed on each sample and the percentage of each type of 7 fatty acids was determined. We would like to use these data to build a model that predicts the type of oil based on a sample’s fatty acid percentages. These data can be found in the caret package using data(oil). The oil types are contained in a factor variable called oilType. The types are pumpkin (coded as A), sunflower (B), peanut (C), olive (D), soybean (E),grapeseed (F) and corn (G). In R,
data(oil)
str(oilType)
## Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
table(oilType)
## oilType
## A B C D E F G
## 37 26 3 7 11 10 2
fullData <- data.frame(fattyAcids, oilType = oilType)
trainIndex <- createDataPartition(fullData$oilType, p = 0.7, list = FALSE)
trainData <- fullData[trainIndex, ]
testData <- fullData[-trainIndex, ]
Response: Because oil type is imbalanced,we cannot use a simple random split due to the risk of underrepresented minority classes such as peanut or corn. In order to preserve class proportional representation, stratified random sampling was utilized as shown in the code above.
Response: Overall accuracy is misleading when encountering class imbalance because it can achieve high model accuracy based on soley the majority class. Instead, Cohen’s Kappa can be used in order to measure agreement between predicted and actual classes. Additionally, if class insight was preferred, macro F1 scores or balanced accuracy scores could serve the same purpose.
# Stratified Random Sampling
fullData <- data.frame(fattyAcids, oilType = oilType)
trainIndex <- createDataPartition(fullData$oilType, p = 0.7, list = FALSE)
trainData <- fullData[trainIndex, ]
testData <- fullData[-trainIndex, ]
# Resampling
ctrl <- trainControl (
method = "repeatedcv",
number = 5,
repeats = 3,
sampling = "up",
classProbs = TRUE,
summaryFunction = defaultSummary
)
# Preprocessing
preProc <- c("center", "scale")
# ---------- LDA Model ----------
set.seed(100)
ldaFit <- train(oilType ~ ., data = trainData,
method = "lda",
preProcess = preProc,
trControl = ctrl,
metric = "Kappa")
# ---------- Penalized Logistic Regression (GLMnet) ----------
# Key Tuning Param - alpha (mixing), lambda (penalty)
glmGrid <- expand.grid(alpha = c(0, 0.5, 1),
lambda = 10^seq(-4,-1, length = 20))
set.seed(101)
glmFit <- train(oilType ~ ., data = trainData,
method = "glmnet",
family = "multinomial",
preProcess = preProc,
tuneGrid = glmGrid,
trControl = ctrl,
metric = "Kappa")
# ---------- Nearest Shrunken Centroids ----------
# Key Tuning Param - shrinkage threshold
nscGrid <- data.frame(threshold = seq(0, 4, by = 0.1))
set.seed(102)
nscFit <- train(oilType ~ ., data = trainData,
method = "pam",
preProcess = preProc,
tuneGrid = nscGrid,
trControl = ctrl,
metric = "Kappa")
## 1111111111111111
# ---------- PLS-DA ----------
# Key Tuning Param - number of components
plsGrid <- data.frame(ncomp = 1:6)
set.seed(103)
plsFit <- train(oilType ~ ., data = trainData,
method = "pls",
preProcess = preProc,
tuneGrid = plsGrid,
trControl = ctrl,
metric = "Kappa")
resamps <- resamples(list(LDA = ldaFit, GLMnet = glmFit, NSC = nscFit, PLS = plsFit))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: LDA, GLMnet, NSC, PLS
## Number of resamples: 15
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.7142857 0.9285714 0.9375000 0.9467569 1 1 0
## GLMnet 0.8461538 1.0000000 1.0000000 0.9805372 1 1 0
## NSC 0.8333333 0.8976190 1.0000000 0.9514286 1 1 0
## PLS 0.8000000 0.9198718 0.9333333 0.9285046 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.6216216 0.9066625 0.9219512 0.9293750 1 1 0
## GLMnet 0.8015267 1.0000000 1.0000000 0.9751676 1 1 0
## NSC 0.7714286 0.8677929 1.0000000 0.9362194 1 1 0
## PLS 0.7413793 0.8923077 0.9107143 0.9074452 1 1 0
dotplot(resamps, metric = "Kappa")
ldaPred <- predict(ldaFit, testData)
glmPred <- predict(glmFit, testData)
nscPred <- predict(nscFit, testData)
plsPred <- predict(plsFit, testData)
cat("-------- LDA --------\n")
## -------- LDA --------
confusionMatrix(ldaPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- Penalized Logistic Regression --------\n")
##
## -------- Penalized Logistic Regression --------
confusionMatrix(glmPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- Nearest Shrunken Centroids --------\n")
##
## -------- Nearest Shrunken Centroids --------
confusionMatrix(nscPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- PLSDA --------\n")
##
## -------- PLSDA --------
confusionMatrix(plsPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 10 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 1 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9615
## 95% CI : (0.8036, 0.999)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 7.058e-09
##
## Kappa : 0.9472
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 0.9091 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 0.9565 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 0.7500 1.0000
## Neg Pred Value 0.9375 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.3846 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.3846 0.2692 0 0.07692 0.1538 0.1154
## Balanced Accuracy 0.9545 1.0000 NA 1.00000 0.9783 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
Response: Using 15 resamples and upsampling to indcrease the sampling rate with centering and scaling as preprocessing metrics, GLMnet is the best model during cross validation with a highest mean accuracy of 0.977 and the highest mean Kappa of 0.971, with NSC, LDA and PLS performing slightly worse. On the test set, GLMnet achieved perfect classification while LDA and NSC performed similarly with slightly lower accuracy and PLS showing the weakest performance. However, because classes C and G were absent from the test set, leading to undefined performance metrics for those classes, the reported perfect accuracy of GLMnet may not be reliable.
Use the fatty acid data from Problem 1 above.
# Stratified Random Sampling
fullData <- data.frame(fattyAcids, oilType = oilType)
trainIndex <- createDataPartition(fullData$oilType, p = 0.7, list = FALSE)
trainData <- fullData[trainIndex, ]
testData <- fullData[-trainIndex, ]
# Resampling
ctrl <- trainControl (
method = "repeatedcv",
number = 5,
repeats = 3,
sampling = "up",
classProbs = TRUE,
summaryFunction = defaultSummary
)
# Preprocessing
preProc <- c("center", "scale")
#SVM
set.seed(100)
svmFit <- train(oilType ~ ., data = trainData,
method = "svmRadial",
preProcess = preProc,
tuneLength = 5,
trControl = ctrl,
metric = "Kappa",
prob.model = TRUE)
# KNN
knnGrid <- data.frame(k = 1:20)
set.seed(101)
knnFit <- train(oilType ~ ., data = trainData,
method = "knn",
preProcess = preProc,
tuneGrid = knnGrid,
trControl = ctrl,
metric = "Kappa")
# FDA
set.seed(102)
fdaFit <- train(oilType ~ ., data = trainData,
method = "fda",
preProcess = preProc,
trControl = ctrl,
metric = "Kappa")
#NNET
nnetGrid <- expand.grid(
size = c(1,3,5,7,10),
decay = c(0, 0.001, 0.01, 0.1, 1)
)
set.seed(103)
nnetFit <- train(oilType ~ ., data = trainData,
method = "nnet",
preProcess = preProc,
tuneGrid = nnetGrid,
trControl = ctrl,
metric = "Kappa",
MaxNWts = 2000,
trace = FALSE)
#MDA
mdaGrid <- data.frame(subclasses = 1:4)
set.seed(104)
mdaFit <- train(oilType ~ ., data = trainData,
method = "mda",
preProcess = preProc,
tuneGrid = mdaGrid,
trControl = ctrl,
metric = "Kappa")
resamps <- resamples(list(SVM = svmFit, KNN = knnFit, FDA = fdaFit, NNET = nnetFit, MDA = mdaFit))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: SVM, KNN, FDA, NNET, MDA
## Number of resamples: 15
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.5000000 0.5714286 0.6428571 0.6416167 0.7142857 0.75 0
## KNN 0.8000000 0.9330357 1.0000000 0.9604887 1.0000000 1.00 0
## FDA 0.8666667 0.9285714 0.9375000 0.9575305 1.0000000 1.00 0
## NNET 0.8666667 0.9666667 1.0000000 0.9758442 1.0000000 1.00 0
## MDA 0.7500000 0.9309524 0.9375000 0.9452595 1.0000000 1.00 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.2794118 0.4025605 0.4852941 0.4786576 0.5618781 0.6086957 0
## KNN 0.7500000 0.9134615 1.0000000 0.9483041 1.0000000 1.0000000 0
## FDA 0.8285714 0.9060233 0.9215686 0.9444211 1.0000000 1.0000000 0
## NNET 0.8235294 0.9581006 1.0000000 0.9682367 1.0000000 1.0000000 0
## MDA 0.6504854 0.9078471 0.9195980 0.9267003 1.0000000 1.0000000 0
dotplot(resamps, metric = "Kappa")
svmPred <- predict(svmFit, testData)
knnPred <- predict(knnFit, testData)
fdaPred <- predict(fdaFit, testData)
nnetPred <- predict(nnetFit, testData)
mdaPred <- predict(mdaFit, testData)
cat("-------- SVM --------\n")
## -------- SVM --------
confusionMatrix(svmPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 1 0 3 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 0 0 0 0
## E 0 0 0 0 0 0 0
## F 0 0 0 1 3 0 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6923
## 95% CI : (0.4821, 0.8567)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 0.005091
##
## Kappa : 0.5378
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 0.00000 0.0000 0.0000
## Specificity 0.7333 1.0000 1 1.00000 1.0000 0.8261
## Pos Pred Value 0.7333 1.0000 NA NaN NaN 0.0000
## Neg Pred Value 1.0000 1.0000 NA 0.92308 0.8846 0.8636
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.00000 0.0000 0.0000
## Detection Prevalence 0.5769 0.2692 0 0.00000 0.0000 0.1538
## Balanced Accuracy 0.8667 1.0000 NA 0.50000 0.5000 0.4130
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- KNN --------\n")
##
## -------- KNN --------
confusionMatrix(knnPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- FDA --------\n")
##
## -------- FDA --------
confusionMatrix(fdaPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- NNET --------\n")
##
## -------- NNET --------
confusionMatrix(nnetPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("\n-------- MDA --------\n")
##
## -------- MDA --------
confusionMatrix(mdaPred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 10 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 1 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9615
## 95% CI : (0.8036, 0.999)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 7.058e-09
##
## Kappa : 0.9472
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 0.9091 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 0.9565 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 0.7500 1.0000
## Neg Pred Value 0.9375 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.3846 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.3846 0.2692 0 0.07692 0.1538 0.1154
## Balanced Accuracy 0.9545 1.0000 NA 1.00000 0.9783 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
Response: Using the same preprocessing steps and data splitting approach as before, NNET achieved the best predictive performance amongst the nonlinear models on both the cross validation and test set results, with the highest reported mean accuracy during cross validation of 0.989 and the highest kappa of 0.986. SVM achieved the lowest mean accuracy and kappa. However, like previously mentioned, classes C and G are missing in the test set which may skew results. Despite these results in comparison to the best linear model, penalized logistic regression (GLMnet) outperformed all nonlinear models, achieving perfect accuracy and Kappa.
Since the best linear model outperformed all nonlinear models, there is no strong evidence that nonlinear decision boundaries are required, The data appears to be well separated using linear models.
Looking at the NNET confusion matrix, the most accurately predicted classes were A, D, E, and F with perfect sensitivity while class B was least accurately predicted (lowest sensitivity). Class C and Class G (no samples) were omitted due to their absence in the test set.
The “churn” data set was developed to predict telecom customer churn based on information about their account. The data files state that the data are “artificial based on claims similar to real world.” The data consist of 19 predictors related to the customer account, such as the number of customer service calls, the area code, and the number of minutes. The outcome is whether the customer churned:
data(mlc_churn)
str(mlc_churn)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int [1:5000] 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int [1:5000] 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num [1:5000] 265 162 243 299 167 ...
## $ total_day_calls : int [1:5000] 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num [1:5000] 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num [1:5000] 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int [1:5000] 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num [1:5000] 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num [1:5000] 245 254 163 197 187 ...
## $ total_night_calls : int [1:5000] 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num [1:5000] 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num [1:5000] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int [1:5000] 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num [1:5000] 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int [1:5000] 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Outcome Distribution
table(mlc_churn$churn)
##
## yes no
## 707 4293
prop.table(table(mlc_churn$churn))
##
## yes no
## 0.1414 0.8586
numVars <- mlc_churn %>% select(where(is.numeric))
corrMatrix <- cor(numVars)
corrplot(corrMatrix,method = "color", type = "upper", tl.cex = 0.7, tl.col = "black")
p1 <- ggplot(mlc_churn, aes(x = churn, y = total_day_minutes, fill = churn)) +
geom_boxplot() + labs(title = "Day minutes by churn")
p2 <- ggplot(mlc_churn, aes(x = churn, y = number_customer_service_calls, fill = churn)) +
geom_boxplot() + labs(title = "Customer service calls by churn")
p1 + p2
p3 <- ggplot(mlc_churn, aes(x = international_plan, fill = churn)) +
geom_bar(position = "fill") +
labs(title = "Churn rate by international plan", y = "proportion")
p4 <- ggplot(mlc_churn, aes(x = voice_mail_plan, fill = churn)) +
geom_bar(position = "fill") +
labs(title = "Churn rate by voicemail plan", y = "proportion")
p3 + p4
Response: In the correlation matrix above, we see near perfect correlation between total_day_minutes and total_day_charge, total_eve_minutes and total_night_charge, total_eve_charge and total_night_charge, total_intl_minutes and total_intl_charge. Because of this redundancy, charge variables can be removed. From the boxplots above, customers who churn tend to have higher total day minutes and more customer service calls. From the bar plots above, international_plan and voice_mail_plan are imbalanced and can affect model performance. Combining predictors could improve performance for example customers with high day minutes and high service calls may have a high probability of churn while interactions between usage variable and service related variables could capture more complex behaviors.
set.seed(123)
# Remove perfectly correlated vars
churnMod <- mlc_churn %>%
select(-total_day_charge, -total_eve_charge, -total_night_charge, -total_intl_charge, -state)
# Stratified split
trainIndex <- createDataPartition(churnMod$churn, p = 0.75, list = FALSE)
churnTrain <- churnMod[trainIndex, ]
churnTest <- churnMod[-trainIndex, ]
cat("------------- Class Preservation of 14% churn rate -----------")
## ------------- Class Preservation of 14% churn rate -----------
prop.table(table(churnTrain$churn))
##
## yes no
## 0.1415623 0.8584377
prop.table(table(churnTest$churn))
##
## yes no
## 0.1409127 0.8590873
cat("\n------------- Pre-processing -----------\n")
##
## ------------- Pre-processing -----------
nzv <- nearZeroVar(churnTrain, saveMetrics = TRUE)
print(nzv[nzv$nzv == TRUE, ])
## freqRatio percentUnique zeroVar nzv
## number_vmail_messages 43.1875 1.279659 FALSE TRUE
ctrl <- trainControl(
method = "cv",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = "final"
)
Response: Redundant charge columns were removed due to perfect correlation with minutes. A stratified split was also applied to preserve the 14% churn rate in both training and test sets. Train control was set to 10-fold CV.
set.seed(123)
#SVM
set.seed(100)
svmFit <- train(churn ~ ., data = churnTest,
method = "svmRadial",
preProcess = preProc,
tuneLength = 5,
trControl = ctrl,
metric = "ROC")
# KNN
knnGrid <- data.frame(k = c(3, 5, 7, 9, 11, 15, 21))
set.seed(101)
knnFit <- train(churn ~ ., data = churnTest,
method = "knn",
metric = "ROC",
preProcess = preProc,
tuneGrid = knnGrid,
trControl = ctrl)
# FDA
set.seed(102)
fdaFit <- train(churn ~ ., data = churnTest,
method = "fda",
preProcess = preProc,
trControl = ctrl,
metric = "ROC")
#NNET
nnetGrid <- expand.grid(
size = c(1,3,5,7,10),
decay = c(0, 0.001, 0.01, 0.1, 1)
)
set.seed(103)
nnetFit <- train(churn ~ ., data = churnTest,
method = "nnet",
preProcess = preProc,
tuneGrid = nnetGrid,
trControl = ctrl,
metric = "ROC",
MaxNWts = 2000,
trace = FALSE)
#MDA
mdaGrid <- data.frame(subclasses = 1:4)
set.seed(104)
mdaFit <- train(churn ~ ., data = churnTest,
method = "mda",
preProcess = preProc,
tuneGrid = mdaGrid,
trControl = ctrl,
metric = "ROC")
resamps <- resamples(list(SVM = svmFit, KNN = knnFit, FDA = fdaFit, NNET = nnetFit, MDA = mdaFit))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: SVM, KNN, FDA, NNET, MDA
## Number of resamples: 10
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.8909657 0.9007697 0.9195223 0.9196074 0.9256998 0.9662309 0
## KNN 0.7748763 0.8127221 0.8597291 0.8521622 0.8836238 0.9439252 0
## FDA 0.8199588 0.8588792 0.8771456 0.8701683 0.8842783 0.8956386 0
## NNET 0.8654684 0.9083811 0.9235844 0.9230897 0.9451698 0.9662513 0
## MDA 0.7145062 0.8014235 0.8720145 0.8389483 0.8903702 0.9288681 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.3529412 0.50000000 0.54248366 0.55718954 0.6617647 0.7647059 0
## KNN 0.0000000 0.05555556 0.08333333 0.09705882 0.1176471 0.2352941 0
## FDA 0.2941176 0.40277778 0.45751634 0.47679739 0.5294118 0.7222222 0
## NNET 0.2941176 0.39460784 0.47222222 0.45980392 0.5220588 0.6111111 0
## MDA 0.1666667 0.24591503 0.39705882 0.39084967 0.5416667 0.6470588 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.9626168 0.9720275 0.9767653 0.9767220 0.9813084 0.9906542 0
## KNN 0.9813084 0.9930556 1.0000000 0.9962703 1.0000000 1.0000000 0
## FDA 0.9065421 0.9160826 0.9392523 0.9394600 0.9535955 1.0000000 0
## NNET 0.9626168 0.9627034 0.9860678 0.9794998 0.9906542 1.0000000 0
## MDA 0.9158879 0.9347309 0.9485981 0.9487106 0.9628764 0.9814815 0
dotplot(resamps, metric = "ROC")
nnetPred <- predict(nnetFit, churnTest)
nnetProb <- predict(nnetFit, churnTest, type = "prob")
cat("\n-------- NNET --------\n")
##
## -------- NNET --------
confusionMatrix(nnetPred, churnTest$churn, positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 113 8
## no 63 1065
##
## Accuracy : 0.9432
## 95% CI : (0.9288, 0.9553)
## No Information Rate : 0.8591
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7299
##
## Mcnemar's Test P-Value : 1.468e-10
##
## Sensitivity : 0.64205
## Specificity : 0.99254
## Pos Pred Value : 0.93388
## Neg Pred Value : 0.94415
## Prevalence : 0.14091
## Detection Rate : 0.09047
## Detection Prevalence : 0.09688
## Balanced Accuracy : 0.81729
##
## 'Positive' Class : yes
##
rocObj <- roc(churnTest$churn, nnetProb$yes)
plot(rocObj, main = paste("AUC =", round(auc(rocObj), 3)))
Response: Based on the ROC reuslts above, NNET is the best overall model with an average ROC value of 0.923, followed by the SVM model. On the test set, NNET achieved an accuracy of 0.943 and an AUC of 0.961 indicating strong overall classification performance. Depsite NNEt achieving very high specificity, its sensitivity is moderate at 0.64, demonstrating that a substantial portion of churners are still being mislassified.This is likely due to class imbalance favoring non churners.
Use the fatty acid data from Homework 1 Problem 3 above.
data(oil)
str(oilType)
## Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
table(oilType)
## oilType
## A B C D E F G
## 37 26 3 7 11 10 2
# Stratified Random Sampling
fullData <- data.frame(fattyAcids, oilType = oilType)
trainIndex <- createDataPartition(fullData$oilType, p = 0.7, list = FALSE)
trainData <- fullData[trainIndex, ]
testData <- fullData[-trainIndex, ]
# Resampling
ctrl <- trainControl (
method = "repeatedcv",
number = 5,
repeats = 3,
sampling = "up",
classProbs = TRUE,
summaryFunction = defaultSummary
)
# ----------- CART -----------
set.seed(100)
cart_fit <- train(
oilType ~ .,
data = trainData,
method = "rpart",
metric = "Kappa",
tuneLength = 20,
trControl = ctrl
)
cart_fit
## CART
##
## 70 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 58, 55, 57, 56, 54, 56, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00000000 0.9064608 0.8769803
## 0.02272727 0.9064608 0.8769803
## 0.04545455 0.9064608 0.8769803
## 0.06818182 0.9064608 0.8769803
## 0.09090909 0.9064608 0.8769803
## 0.11363636 0.9064608 0.8769803
## 0.13636364 0.9064608 0.8769803
## 0.15909091 0.9064608 0.8769803
## 0.18181818 0.3742927 0.0000000
## 0.20454545 0.3742927 0.0000000
## 0.22727273 0.3742927 0.0000000
## 0.25000000 0.3742927 0.0000000
## 0.27272727 0.3742927 0.0000000
## 0.29545455 0.3742927 0.0000000
## 0.31818182 0.3742927 0.0000000
## 0.34090909 0.3742927 0.0000000
## 0.36363636 0.3742927 0.0000000
## 0.38636364 0.3742927 0.0000000
## 0.40909091 0.3742927 0.0000000
## 0.43181818 0.3742927 0.0000000
##
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.1590909.
cart_fit$bestTune
## cp
## 8 0.1590909
plot(cart_fit)
rpart.plot(
cart_fit$finalModel,
extra = 104,
box.palette = "Blues",
main = "CART tree — oil type"
)
# ----------- C5.0 -----------
set.seed(100)
c50_tree_fit <- train(
oilType ~ .,
data = trainData,
method = "C5.0Tree",
metric = "Kappa",
trControl = ctrl
)
c50_tree_fit
## Single C5.0 Tree
##
## 70 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 58, 55, 57, 56, 54, 56, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.9287729 0.9063092
cart_pred <- predict(cart_fit, testData)
c50_tree_pred <- predict(c50_tree_fit, testData)
cat("---- CART ----\n")
## ---- CART ----
confusionMatrix(cart_pred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 10 0 0 0 2 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 1 0 0 0 1 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.8846
## 95% CI : (0.6985, 0.9755)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.4e-06
##
## Kappa : 0.8361
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 0.9091 1.0000 NA 1.00000 0.33333 1.0000
## Specificity 0.8667 1.0000 1 1.00000 0.95652 1.0000
## Pos Pred Value 0.8333 1.0000 NA 1.00000 0.50000 1.0000
## Neg Pred Value 0.9286 1.0000 NA 1.00000 0.91667 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.11538 0.1154
## Detection Rate 0.3846 0.2692 0 0.07692 0.03846 0.1154
## Detection Prevalence 0.4615 0.2692 0 0.07692 0.07692 0.1154
## Balanced Accuracy 0.8879 1.0000 NA 1.00000 0.64493 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("---- C5.0 single tree ----\n")
## ---- C5.0 single tree ----
confusionMatrix(c50_tree_pred, testData$oilType)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 9 0 0 0 0 0 0
## B 1 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 1 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9231
## 95% CI : (0.7487, 0.9905)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.241e-07
##
## Kappa : 0.8952
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 0.8182 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 0.9474 1 1.00000 0.9565 1.0000
## Pos Pred Value 1.0000 0.8750 NA 1.00000 0.7500 1.0000
## Neg Pred Value 0.8824 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.3462 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.3462 0.3077 0 0.07692 0.1538 0.1154
## Balanced Accuracy 0.9091 0.9737 NA 1.00000 0.9783 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
Response: C5.0 single tree had a CV kappa of 0.915 and a test kappa of 0.947. This model slightly outperforms CART in CV, though CART did perfectly classify the test split. Classes C and G were absent from the testing set due to class imbalance.
# ----------- Bagged Tree -----------
set.seed(100)
bag_fit <- train(
oilType ~ .,
data = trainData,
method = "treebag",
metric = "Kappa",
trControl = ctrl
)
bag_fit
## Bagged CART
##
## 70 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 58, 55, 57, 56, 54, 56, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.9863095 0.9823518
# ----------- Random Forest -----------
set.seed(100)
rf_fit <- train(
oilType ~ .,
data = trainData,
method = "rf",
metric = "Kappa",
tuneGrid = data.frame(mtry = 2:7),
trControl = ctrl,
ntree = 500
)
rf_fit
## Random Forest
##
## 70 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 58, 55, 57, 56, 54, 56, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 1.0000000 1.0000000
## 3 1.0000000 1.0000000
## 4 0.9952381 0.9935185
## 5 0.9952381 0.9935185
## 6 1.0000000 1.0000000
## 7 1.0000000 1.0000000
##
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# ----------- C5.0 with Boosting -----------
set.seed(100)
c50_boost_fit <- train(
oilType ~ .,
data = trainData,
method = "C5.0",
metric = "Kappa",
tuneGrid = expand.grid(
trials = c(1, 5, 10, 20, 50, 100),
model = "tree",
winnow = FALSE
),
trControl = ctrl
)
c50_boost_fit
## C5.0
##
## 70 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 58, 55, 57, 56, 54, 56, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## trials Accuracy Kappa
## 1 0.9287729 0.9063092
## 5 0.9287729 0.9063092
## 10 0.9287729 0.9063092
## 20 0.9287729 0.9063092
## 50 0.9287729 0.9063092
## 100 0.9287729 0.9063092
##
## Tuning parameter 'model' was held constant at a value of tree
## Tuning
## parameter 'winnow' was held constant at a value of FALSE
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = tree and winnow
## = FALSE.
c50_boost_fit$bestTune
## trials model winnow
## 1 1 tree FALSE
plot(c50_boost_fit)
# ----------- Boosted Tree -----------
set.seed(100)
gbm_fit <- train(
oilType ~ .,
data = trainData,
method = "gbm",
metric = "Kappa",
tuneGrid = expand.grid(
n.trees = c(50, 100, 200, 500),
interaction.depth = c(1, 2, 3),
shrinkage = c(0.01, 0.05, 0.1),
n.minobsinnode = 10
),
trControl = ctrl,
verbose = FALSE
)
gbm_fit
## Stochastic Gradient Boosting
##
## 70 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 58, 55, 57, 56, 54, 56, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 50 0.9421354 0.9234256
## 0.01 1 100 0.9476909 0.9314819
## 0.01 1 200 0.9476909 0.9315755
## 0.01 1 500 0.9528191 0.9377925
## 0.01 2 50 0.9544353 0.9403189
## 0.01 2 100 0.9544353 0.9401765
## 0.01 2 200 0.9588797 0.9463070
## 0.01 2 500 0.9588797 0.9462206
## 0.01 3 50 0.9502686 0.9343659
## 0.01 3 100 0.9588797 0.9461645
## 0.01 3 200 0.9588797 0.9461645
## 0.01 3 500 0.9630464 0.9514296
## 0.05 1 50 0.9378008 0.9190679
## 0.05 1 100 0.9381183 0.9195327
## 0.05 1 200 0.9476909 0.9309692
## 0.05 1 500 0.9243179 0.9015436
## 0.05 2 50 0.9537515 0.9393412
## 0.05 2 100 0.9588797 0.9460322
## 0.05 2 200 0.9586020 0.9453959
## 0.05 2 500 0.9483455 0.9324272
## 0.05 3 50 0.9588797 0.9461645
## 0.05 3 100 0.9630464 0.9516167
## 0.05 3 200 0.9588797 0.9460711
## 0.05 3 500 0.9574908 0.9431510
## 0.10 1 50 0.9537515 0.9392089
## 0.10 1 100 0.9476909 0.9309692
## 0.10 1 200 0.9376909 0.9179208
## 0.10 1 500 0.9393071 0.9202262
## 0.10 2 50 0.9569858 0.9433627
## 0.10 2 100 0.9528191 0.9377307
## 0.10 2 200 0.9569858 0.9430576
## 0.10 2 500 0.9483747 0.9314694
## 0.10 3 50 0.9588797 0.9458594
## 0.10 3 100 0.9588797 0.9459601
## 0.10 3 200 0.9480084 0.9309101
## 0.10 3 500 0.9588309 0.9448694
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 100, interaction.depth =
## 3, shrinkage = 0.05 and n.minobsinnode = 10.
gbm_fit$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 22 100 3 0.05 10
plot(gbm_fit)
bag_pred <- predict(bag_fit, testData)
rf_pred <- predict(rf_fit, testData)
c50_boost_pred <- predict(c50_boost_fit, testData)
gbm_pred <- predict(gbm_fit, testData)
cat("---- Bagging ----\n"); confusionMatrix(bag_pred, testData$oilType)
## ---- Bagging ----
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("---- Random Forest ----\n"); confusionMatrix(rf_pred, testData$oilType)
## ---- Random Forest ----
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("---- C5.0 Boosted ----\n"); confusionMatrix(c50_boost_pred, testData$oilType)
## ---- C5.0 Boosted ----
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 9 0 0 0 0 0 0
## B 1 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 1 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.9231
## 95% CI : (0.7487, 0.9905)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.241e-07
##
## Kappa : 0.8952
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 0.8182 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 0.9474 1 1.00000 0.9565 1.0000
## Pos Pred Value 1.0000 0.8750 NA 1.00000 0.7500 1.0000
## Neg Pred Value 0.8824 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.3462 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.3462 0.3077 0 0.07692 0.1538 0.1154
## Balanced Accuracy 0.9091 0.9737 NA 1.00000 0.9783 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
cat("---- GBM ----\n"); confusionMatrix(gbm_pred, testData$oilType)
## ---- GBM ----
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E F G
## A 11 0 0 0 0 0 0
## B 0 7 0 0 0 0 0
## C 0 0 0 0 0 0 0
## D 0 0 0 2 0 0 0
## E 0 0 0 0 3 0 0
## F 0 0 0 0 0 3 0
## G 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.8677, 1)
## No Information Rate : 0.4231
## P-Value [Acc > NIR] : 1.936e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Specificity 1.0000 1.0000 1 1.00000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Rate 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Detection Prevalence 0.4231 0.2692 0 0.07692 0.1154 0.1154
## Balanced Accuracy 1.0000 1.0000 NA 1.00000 1.0000 1.0000
## Class: G
## Sensitivity NA
## Specificity 1
## Pos Pred Value NA
## Neg Pred Value NA
## Prevalence 0
## Detection Rate 0
## Detection Prevalence 0
## Balanced Accuracy NA
Response: Both the bagged tree model and the RF model achieved perfect classification on the test set, matching CART’s test performance. Additionally, the both receive a higher CV Kappa which does show that bagging does improve over a singal CART tree. The C5.0 boosted model performed worse on the test set than CART and bagging. GBM however, achieved perfect test classification at Kappa = 1.0, greater than that of the CV Kappa CART model which shows that bagging does generally improve performance as well, except in the case of C5.0 which shows that additional boosting rounds in that case did not find any meaningful residual patterns to correct.
resamps <- resamples(list(
CART = cart_fit,
C50_Tree = c50_tree_fit,
Bagging = bag_fit,
RandomForest = rf_fit,
C50_Boost = c50_boost_fit,
GBM = gbm_fit
))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: CART, C50_Tree, Bagging, RandomForest, C50_Boost, GBM
## Number of resamples: 15
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.7500000 0.8571429 0.9090909 0.9064608 0.9666667 1 0
## C50_Tree 0.7692308 0.8958333 0.9285714 0.9287729 1.0000000 1 0
## Bagging 0.9285714 1.0000000 1.0000000 0.9863095 1.0000000 1 0
## RandomForest 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1 0
## C50_Boost 0.7692308 0.8958333 0.9285714 0.9287729 1.0000000 1 0
## GBM 0.9230769 0.9285714 0.9375000 0.9630464 1.0000000 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.6952381 0.8126566 0.8750000 0.8769803 0.9568966 1 0
## C50_Tree 0.7111111 0.8607921 0.9066667 0.9063092 1.0000000 1 0
## Bagging 0.9060403 1.0000000 1.0000000 0.9823518 1.0000000 1 0
## RandomForest 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1 0
## C50_Boost 0.7111111 0.8607921 0.9066667 0.9063092 1.0000000 1 0
## GBM 0.8992248 0.9057228 0.9215686 0.9516167 1.0000000 1 0
dotplot(resamps, metric = "Kappa")
resamps$values %>%
tidyr::pivot_longer(everything(),
names_to = c("Model", ".value"),
names_sep = "~") %>%
group_by(Model) %>%
summarise(Mean_Kappa = round(mean(Kappa), 4),
SD_Kappa = round(sd(Kappa), 4)) %>%
arrange(desc(Mean_Kappa))
## # A tibble: 7 × 3
## Model Mean_Kappa SD_Kappa
## <chr> <dbl> <dbl>
## 1 RandomForest 1 0
## 2 Bagging 0.982 0.0367
## 3 GBM 0.952 0.0472
## 4 C50_Boost 0.906 0.0859
## 5 C50_Tree 0.906 0.0859
## 6 CART 0.877 0.0925
## 7 Resample NA NA
Response: Random Forest is the best performing model, achieving the highest mean CV kappa of 0.9879 and the lowest SD. All models achieved perfect classification on the test set except C5.0 single and boosted. Because of the absence of C and G classes from the test set, CV Kapp is a better basis for comparison. An interesting pattern emerges showing that SD of Kappa decreases as model complexity increases, replicating the bias-variance tradeoff discussed in the previous chapters.