# required packages
library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ISLR) #the stock market data
library(pROC) #roc
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(MASS) #lda
library(klaR) #rda
library(mda) #mda
## Loading required package: class
## Loaded mda 0.5-5
library(earth) #fda
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(e1071)
library(kernlab) #SVM
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(rpart)
library(rpart.plot)
data(oil)
str(oilType)
## Factor w/ 7 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
table(oilType)
## oilType
## A B C D E F G
## 37 26 3 7 11 10 2
I would use a stratified random sampling so each oil type is proportionally represented in both training and testing sets.
I would optimize for Kappa, which provides a more reliable measure of performance when there is class imbalance compared to overall accuracy. While sensitivity and specificity can be examined for each class individually, they are not ideal as a single summary metric for model tuning.
oilData <- data.frame(fattyAcids, oilType = oilType)
set.seed(123)
trainIndex <- createDataPartition(oilData$oilType, p = 0.8, list = FALSE)
trainData <- oilData[trainIndex, ]
testData <- oilData[-trainIndex, ]
ctrl <- trainControl(method = "LGOCV",
classProbs = TRUE,
savePredictions = TRUE)
# LDA
ldaTune <- train(oilType ~ .,
data = trainData,
method = "lda",
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
# PLSDA
psldaTune <- train(oilType ~ .,
data = trainData,
method = "pls",
tuneGrid = expand.grid(.ncomp = 1:5),
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
# GLM
glmGrid <- expand.grid(alpha = seq(0,1,length=5),
lambda = seq(0.001, 0.1, length=10))
glmTune <- train(oilType ~ .,
data = trainData,
method = "glmnet",
tuneGrid = glmGrid,
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
# NSC
nscGrid <- data.frame(.threshold = 0:25)
nscTune <- train(oilType ~ .,
data = trainData,
method = "pam",
tuneGrid = nscGrid,
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
## 11111111111111111111111111
# Store results
testResults <- data.frame(obs = testData$oilType,
LDA = predict(ldaTune, testData),
PSLDA = predict(psldaTune, testData),
GLM = predict(glmTune, testData),
NSC = predict(nscTune, testData))
# Model comparisons
resamps <- resamples(list(LDA = ldaTune,
PLSDA = psldaTune,
GLM = glmTune,
NSC = nscTune))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: LDA, PLSDA, GLM, NSC
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8823529 0.9411765 0.9411765 0.9435294 1 1 0
## PLSDA 0.9411765 0.9411765 1.0000000 0.9788235 1 1 0
## GLM 0.9411765 1.0000000 1.0000000 0.9952941 1 1 0
## NSC 0.8823529 0.9411765 1.0000000 0.9694118 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8380952 0.9182692 0.9182692 0.9225076 1 1 0
## PLSDA 0.9182692 0.9182692 1.0000000 0.9705769 1 1 0
## GLM 0.9182692 1.0000000 1.0000000 0.9934615 1 1 0
## NSC 0.8380952 0.9182692 1.0000000 0.9575623 1 1 0
# Best tuning params
ldaTune$bestTune
## parameter
## 1 none
psldaTune$bestTune
## ncomp
## 4 4
glmTune$bestTune
## alpha lambda
## 47 1 0.067
nscTune$bestTune
## threshold
## 1 0
The models considered were:
Linear Discriminant Analysis (LDA)
Partial Least Squares Discriminant Analysis (PLSDA)
Penalized Logistic Regression (GLMNET)
Nearest Shrunken Centroids (NSC)
The optimal tuning parameters selected via resampling were:
PLSDA: .ncomp = 4
GLMNET: alpha = 0.25, lambda = 0.001
NSC: threshold = 2
Among the models considered, the GLM (glmnet) model performs best, achieving the highest mean accuracy (≈0.995) and Kappa (≈0.994) across resamples. It also shows minimal variability, indicating very stable performance. Based on the confusion matrix, the model most accurately predicts oil types with larger sample sizes, particularly pumpkin (A) and sunflower (B). In contrast, the least accurately predicted oil types are corn (G) and peanut (C), which have very few observations in the dataset. This reduced performance is likely due to class imbalance, as the model has insufficient data to effectively learn these minority classes.
# MDA
mdaTune <- train(oilType ~ .,
data = trainData,
method = "mda",
tuneGrid = expand.grid(.subclasses = 3:10),
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
# KNN
knnTune <- train(oilType ~ .,
data = trainData,
method = "knn",
trControl = ctrl,
metric = "Kappa",
tuneGrid = data.frame(.k = seq(1, 50, by = 2)),
preProcess = c("center", "scale"))
# NNet
nnetGrid <- expand.grid(.size = 1:10,
.decay = c(0, .1, 1, 2))
maxSize <- max(nnetGrid$.size)
numWts <-200
nnTune <- train(oilType ~ .,
data = trainData,
method = "nnet",
trControl = ctrl,
metric = "Kappa",
tuneGrid = nnetGrid,
trace = FALSE,
maxit = 2000,
MaxNWts = numWts,
preProcess = c("center", "scale", "spatialSign"))
# FDA
fdaTune <- train(oilType ~ .,
data = trainData,
method = "fda",
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
# Compare
resamps_2 <- resamples(list(MDA = mdaTune,
KNN = knnTune,
NNET = nnTune,
FDA = fdaTune))
summary(resamps_2)
##
## Call:
## summary.resamples(object = resamps_2)
##
## Models: MDA, KNN, NNET, FDA
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MDA 0.8823529 1.0000000 1 0.9858824 1 1 0
## KNN 0.9411765 0.9411765 1 0.9835294 1 1 0
## NNET 0.8823529 0.9411765 1 0.9717647 1 1 0
## FDA 0.8235294 0.9411765 1 0.9670588 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MDA 0.8380952 1.0000000 1 0.9807513 1 1 0
## KNN 0.9170732 0.9194313 1 0.9770635 1 1 0
## NNET 0.8380952 0.9194313 1 0.9612464 1 1 0
## FDA 0.7627907 0.9182692 1 0.9545260 1 1 0
Some nonlinear models (e.g., QDA and RDA) failed during training due to numerical instability caused by very small sample sizes in certain classes. As a result, only the successfully trained models were considered in the comparison. Among the succesfully trained nonlinear models, MDA achieved the highest predictive performance with a mean Kappa of approximately 0.987, closely followed by kNN. However, this performance is slightly lower than that of the best linear model from Problem 1 (GLM), which achieved a Kappa of approximately 0.994. Therefore, nonlinear models do not provide an improvement over the linear model for these data.
Since the nonlinear models do not outperform the best linear model, I would not infer that the data have nonlinear separationnn boundaries.
The optimal nonlinear model most accurately predicts oil types with larger sample sizes, particularly pumpkin (A) and sunflower (B). The least accurately predicted oil types are corn (G) and peanut (C), which have very few observations. This class imbalance limits the model’s ability to learn reliable patterns for these categories.
library(modeldata)
##
## Attaching package: 'modeldata'
## The following object is masked from 'package:datasets':
##
## penguins
data(mlc_churn)
str(mlc_churn)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int [1:5000] 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int [1:5000] 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num [1:5000] 265 162 243 299 167 ...
## $ total_day_calls : int [1:5000] 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num [1:5000] 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num [1:5000] 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int [1:5000] 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num [1:5000] 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num [1:5000] 245 254 163 197 187 ...
## $ total_night_calls : int [1:5000] 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num [1:5000] 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num [1:5000] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int [1:5000] 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num [1:5000] 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int [1:5000] 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
# Predictors vs churn
ggplot(mlc_churn, aes(x = international_plan, fill = churn)) + geom_bar(position = "fill")
ggplot(mlc_churn, aes(x = voice_mail_plan, fill = churn)) + geom_bar(position = "fill")
ggplot(mlc_churn, aes(x = churn, y = total_day_minutes)) + geom_boxplot()
ggplot(mlc_churn, aes(x = churn, y = number_customer_service_calls)) + geom_boxplot()
# Between-predictor correlations
num_vars <- mlc_churn |>
dplyr::select(where(is.numeric))
cor_matrix <- cor(num_vars)
corrplot(cor_matrix)
# Degenerate distributions
colnames(mlc_churn)[nearZeroVar(mlc_churn)]
## [1] "number_vmail_messages"
Customers with an international plan and those with a higher number of customer service calls were more likely to churn, suggesting dissatisfaction as a key driver. Higher usage variables, such as total day minutes and charges, were also associated with increased churn. Among predictors, strong correlations were observed between variables such as minutes and their corresponding charges, indicating redundancy in the data. Additionally, the number of voicemail messages was identified as a near-zero variance predictor, suggesting limited usefulness and potential redundancy with the voicemail plan variable. These findings indicate that churn is influenced by usage patterns and service interactions, and that combining predictors (e.g., total usage) or accounting for nonlinear relationships may improve model performance.
# Split
set.seed(123)
train_idx <- sample(1:nrow(mlc_churn), 0.8 * nrow(mlc_churn))
train_data <- mlc_churn[train_idx, ]
test_data <- mlc_churn[-train_idx, ]
# Pre-process
trans <- preProcess(train_data[, -which(names(train_data) == "churn")],
method = c("center", "scale", "corr"))
nzv <- nearZeroVar(mlc_churn)
mlc_churn <- mlc_churn[, -nzv]
# Naïve Bayes
nbTune2 <- train(churn ~ .,
data = train_data,
method = "nb",
trControl = ctrl,
metric = "Kappa",
preProcess = c("center", "scale"))
# KNN
knnTune2 <- train(churn ~ .,
data = train_data,
method = "knn",
trControl = ctrl,
metric = "Kappa",
tuneGrid = data.frame(.k = seq(5, 25, by = 5)),
preProcess = c("center", "scale"))
# NNet
nnetGrid2 <- expand.grid(.size = 1:5,
.decay = c(0, 0.1))
nnTune2 <- train(churn ~ .,
data = train_data,
method = "nnet",
trControl = ctrl,
metric = "Kappa",
tuneGrid = nnetGrid2,
trace = FALSE,
maxit = 500,
preProcess = c("center", "scale"))
# SVM
svmTune2 <- train(churn ~ .,
data = train_data,
method = "svmRadial",
trControl = ctrl,
metric = "Kappa",
tuneLength = 5,
preProcess = c("center", "scale"))
# Compare
resamps3 <- resamples(list(NB = nbTune2,
KNN = knnTune2,
NNET = nnTune2,
SVM = svmTune2))
summary(resamps3)
##
## Call:
## summary.resamples(object = resamps3)
##
## Models: NB, KNN, NNET, SVM
## Number of resamples: 25
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NB 0.5625626 0.5903403 0.6031031 0.6003921 0.6138639 0.6326326 1
## KNN 0.8518519 0.8588589 0.8618619 0.8611411 0.8638639 0.8658659 0
## NNET 0.8198198 0.8378378 0.8468468 0.8518919 0.8658659 0.9039039 0
## SVM 0.8688689 0.8838839 0.8888889 0.8880480 0.8918919 0.9049049 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## NB 0.06494150 0.08815991 0.1007856 0.1011317 0.1132640 0.1348486 1
## KNN 0.03117018 0.07838314 0.1068982 0.1023917 0.1209318 0.1544541 0
## NNET 0.11548411 0.32718323 0.3536703 0.3692305 0.3973840 0.5456177 0
## SVM 0.38481463 0.45046146 0.4705980 0.4690241 0.4945185 0.5508625 0
The neural network and SVM models performed substantially better, achieving the highest accuracy and Kappa values. This indicates that these models were more effective at capturing the underlying structure of the data. Because models capable of capturing nonlinear relationships outperformed others, the results suggest that the relationship between predictors and churn is complex and not purely linear.
set.seed(123)
part = createDataPartition(oilData$oilType, p = 60/96, list = FALSE)
trainData4 = oilData[part, ]
testData4 = oilData[-part, ]
# Tree 1
tree1 <- rpart(oilType ~ ., data = trainData4, method = "class")
rpart.plot(tree1)
pred_tree1 <- predict(tree1, testData4, type = "class")
mean(pred_tree1 == testData4$oilType)
## [1] 0.75
# Tree 2
tree2 <- rpart(oilType ~ ., data = trainData4,
method = "class",
control = rpart.control(cp = 0.001, maxdepth = 10))
pred_tree2 <- predict(tree2, testData4, type = "class")
mean(pred_tree2 == testData4$oilType)
## [1] 0.75
# Bagging
set.seed(100)
treebagTune <- train(oilType ~ .,
data = trainData4,
method = "treebag",
nbagg = 50,
trControl = ctrl)
treebagTune
## Bagged CART
##
## 64 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (25 reps, 75%)
## Summary of sample sizes: 51, 51, 51, 51, 51, 51, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9538462 0.9318392
# Boosting
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 100),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 10)
set.seed(100)
gbmTune <- train(oilType ~ .,
data = trainData4,
method = "gbm",
tuneGrid = gbmGrid,
trControl = ctrl,
verbose = FALSE)
gbmTune
## Stochastic Gradient Boosting
##
## 64 samples
## 7 predictor
## 7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G'
##
## No pre-processing
## Resampling: Repeated Train/Test Splits Estimated (25 reps, 75%)
## Summary of sample sizes: 51, 51, 51, 51, 51, 51, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 100 0.9723077 0.9584564
## 0.01 1 200 0.9784615 0.9679469
## 0.01 1 300 0.9784615 0.9678654
## 0.01 1 400 0.9815385 0.9725501
## 0.01 1 500 0.9846154 0.9771930
## 0.01 1 600 0.9846154 0.9771930
## 0.01 1 700 0.9846154 0.9771930
## 0.01 1 800 0.9815385 0.9727506
## 0.01 1 900 0.9846154 0.9771930
## 0.01 1 1000 0.9815385 0.9727506
## 0.01 3 100 0.9661538 0.9494151
## 0.01 3 200 0.9753846 0.9632226
## 0.01 3 300 0.9815385 0.9725501
## 0.01 3 400 0.9846154 0.9771930
## 0.01 3 500 0.9846154 0.9771930
## 0.01 3 600 0.9846154 0.9771930
## 0.01 3 700 0.9846154 0.9772723
## 0.01 3 800 0.9846154 0.9772723
## 0.01 3 900 0.9846154 0.9772723
## 0.01 3 1000 0.9815385 0.9727506
## 0.01 5 100 0.9692308 0.9539765
## 0.01 5 200 0.9723077 0.9586194
## 0.01 5 300 0.9753846 0.9633040
## 0.01 5 400 0.9815385 0.9726316
## 0.01 5 500 0.9846154 0.9771930
## 0.01 5 600 0.9846154 0.9772723
## 0.01 5 700 0.9846154 0.9772723
## 0.01 5 800 0.9815385 0.9727506
## 0.01 5 900 0.9815385 0.9727506
## 0.01 5 1000 0.9815385 0.9727506
## 0.01 7 100 0.9661538 0.9493336
## 0.01 7 200 0.9784615 0.9679073
## 0.01 7 300 0.9784615 0.9679073
## 0.01 7 400 0.9815385 0.9725501
## 0.01 7 500 0.9815385 0.9727506
## 0.01 7 600 0.9815385 0.9727506
## 0.01 7 700 0.9815385 0.9727506
## 0.01 7 800 0.9815385 0.9727506
## 0.01 7 900 0.9784615 0.9682288
## 0.01 7 1000 0.9815385 0.9726712
## 0.10 1 100 0.9815385 0.9727506
## 0.10 1 200 0.9846154 0.9772326
## 0.10 1 300 0.9815385 0.9727109
## 0.10 1 400 0.9815385 0.9727109
## 0.10 1 500 0.9846154 0.9772326
## 0.10 1 600 0.9815385 0.9728629
## 0.10 1 700 0.9815385 0.9730149
## 0.10 1 800 0.9815385 0.9730149
## 0.10 1 900 0.9753846 0.9642754
## 0.10 1 1000 0.9753846 0.9643151
## 0.10 3 100 0.9784615 0.9680659
## 0.10 3 200 0.9846154 0.9772326
## 0.10 3 300 0.9846154 0.9772326
## 0.10 3 400 0.9846154 0.9772326
## 0.10 3 500 0.9846154 0.9772326
## 0.10 3 600 0.9846154 0.9772326
## 0.10 3 700 0.9815385 0.9728629
## 0.10 3 800 0.9753846 0.9641234
## 0.10 3 900 0.9753846 0.9641234
## 0.10 3 1000 0.9723077 0.9597537
## 0.10 5 100 0.9753846 0.9638591
## 0.10 5 200 0.9815385 0.9727506
## 0.10 5 300 0.9815385 0.9727506
## 0.10 5 400 0.9846154 0.9772326
## 0.10 5 500 0.9846154 0.9772723
## 0.10 5 600 0.9846154 0.9772723
## 0.10 5 700 0.9815385 0.9729026
## 0.10 5 800 0.9784615 0.9686848
## 0.10 5 900 0.9784615 0.9686451
## 0.10 5 1000 0.9753846 0.9643547
## 0.10 7 100 0.9815385 0.9726295
## 0.10 7 200 0.9815385 0.9727506
## 0.10 7 300 0.9846154 0.9772723
## 0.10 7 400 0.9846154 0.9772723
## 0.10 7 500 0.9876923 0.9817544
## 0.10 7 600 0.9815385 0.9729019
## 0.10 7 700 0.9784615 0.9684932
## 0.10 7 800 0.9753846 0.9641234
## 0.10 7 900 0.9753846 0.9641234
## 0.10 7 1000 0.9784615 0.9686451
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 500, interaction.depth =
## 7, shrinkage = 0.1 and n.minobsinnode = 10.
Bagging improved the performance compared to a single decision tree by reducing variance through averaging multiple bootstrap samples. The bagged model achieved an accuracy of approximately 0.953, which was higher than the single tree model. Boosting further improved performance by sequentially correcting errors made by previous trees. The boosting model achieved an even higher accuracy (up to approximately 0.988), demonstrating superior predictive performance compared to both the single tree and bagging models.
Among all models, the boosting model achieved the best performance. The optimal tuning parameters were:
Number of trees (n.trees) = 500
Interaction depth = 7
Shrinkage = 0.1
Minimum observations per node = 10
This model achieved the highest accuracy of approximately 0.988. In comparison, the bagging model achieved an accuracy of approximately 0.953, while the single decision tree performed worse. Therefore, boosting provided the best overall performance for this dataset.