Problem1

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(oil)

table(oilType)
## oilType
##  A  B  C  D  E  F  G 
## 37 26  3  7 11 10  2
prop.table(table(oilType))
## oilType
##          A          B          C          D          E          F          G 
## 0.38541667 0.27083333 0.03125000 0.07291667 0.11458333 0.10416667 0.02083333

1(a)

oil_dat <- data.frame(oilType = oilType, fattyAcids)

set.seed(123)

test_idx <- unlist(lapply(split(seq_len(nrow(oil_dat)), oil_dat$oilType), function(idx) {
  sample(idx, size = max(1, floor(0.20 * length(idx))))
}))

train_dat <- oil_dat[-test_idx, ]
test_dat  <- oil_dat[test_idx, ]

table(train_dat$oilType)
## 
##  A  B  C  D  E  F  G 
## 30 21  2  6  9  8  1
table(test_dat$oilType)
## 
## A B C D E F G 
## 7 5 1 1 2 2 1

Because the response variable is highly imbalanced, I created the training and testing sets using a stratified approach based on oil type so that each class would still be represented in the test set. This was especially important because some classes, such as C and G, had very few observations. A simple random split could easily place all observations from a minority class into one set. Therefore, I used an approximately 80/20 split while ensuring that every oil type appeared in the test set.

1(b)

ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5
)

ctrl
## $method
## [1] "repeatedcv"
## 
## $number
## [1] 10
## 
## $repeats
## [1] 5
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
## 
## $horizon
## [1] 1
## 
## $fixedWindow
## [1] TRUE
## 
## $skip
## [1] 0
## 
## $verboseIter
## [1] FALSE
## 
## $returnData
## [1] TRUE
## 
## $returnResamp
## [1] "final"
## 
## $savePredictions
## [1] FALSE
## 
## $classProbs
## [1] FALSE
## 
## $summaryFunction
## function (data, lev = NULL, model = NULL) 
## {
##     if (is.character(data$obs)) 
##         data$obs <- factor(data$obs, levels = lev)
##     postResample(data[, "pred"], data[, "obs"])
## }
## <bytecode: 0x1400e2588>
## <environment: namespace:caret>
## 
## $selectionFunction
## [1] "best"
## 
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
## 
## $preProcOptions$ICAcomp
## [1] 3
## 
## $preProcOptions$k
## [1] 5
## 
## $preProcOptions$freqCut
## [1] 19
## 
## $preProcOptions$uniqueCut
## [1] 10
## 
## $preProcOptions$cutoff
## [1] 0.9
## 
## 
## $sampling
## NULL
## 
## $index
## NULL
## 
## $indexOut
## NULL
## 
## $indexFinal
## NULL
## 
## $timingSamps
## [1] 0
## 
## $predictionBounds
## [1] FALSE FALSE
## 
## $seeds
## [1] NA
## 
## $adaptive
## $adaptive$min
## [1] 5
## 
## $adaptive$alpha
## [1] 0.05
## 
## $adaptive$method
## [1] "gls"
## 
## $adaptive$complete
## [1] TRUE
## 
## 
## $trim
## [1] FALSE
## 
## $allowParallel
## [1] TRUE

I chose Kappa as the main classification statistic. Because the oil types are imbalanced, overall accuracy can be misleading since a model may perform well mainly by predicting the majority classes correctly. Kappa is more appropriate here because it adjusts for agreement that could occur by chance and provides a better comparison of model performance under class imbalance.

pp <- preProcess(train_dat[, -1], method = c("center", "scale"))

train_x <- predict(pp, train_dat[, -1])
test_x  <- predict(pp, test_dat[, -1])

train_ready <- data.frame(oilType = train_dat$oilType, train_x)
test_ready  <- data.frame(oilType = test_dat$oilType, test_x)

dim(train_ready)
## [1] 77  8
dim(test_ready)
## [1] 19  8
head(train_ready)
##   oilType  Palmitic     Stearic      Oleic   Linoleic  Linolenic Eicosanoic
## 1       A 0.2725475  0.80268323 -0.3901815  0.3954120 -0.6372155 0.03534662
## 2       A 0.8116414  0.63938968 -0.2631032  0.2128324 -0.6715511 0.03534662
## 4       A 0.3880676  0.47609613 -0.4303114  0.4457788 -0.6715511 0.03534662
## 5       A 1.2352153  0.63938968 -0.3834931  0.2569033 -0.6715511 0.03534662
## 6       A 0.3110542 -0.01378452  0.4124180 -0.4545276  0.0494968 0.03534662
## 7       A 0.5806012  0.63938968 -0.3366748  0.3072701 -0.6372155 0.03534662
##   Eicosenoic
## 1 -0.4905193
## 2 -0.4905193
## 4 -0.4905193
## 5 -0.4905193
## 6  0.5586470
## 7 -0.4905193
set.seed(123)

model_lda <- train(
  oilType ~ .,
  data = train_ready,
  method = "lda",
  trControl = ctrl,
  metric = "Kappa"
)
## Warning in lda.default(x, grouping, ...): group G is empty
## Warning in lda.default(x, grouping, ...): group G is empty
## Warning in lda.default(x, grouping, ...): group G is empty
## Warning in lda.default(x, grouping, ...): group G is empty
## Warning in lda.default(x, grouping, ...): group G is empty
model_lda
## Linear Discriminant Analysis 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9537778  0.9379884
set.seed(123)

model_multinom <- train(
  oilType ~ .,
  data = train_ready,
  method = "multinom",
  trControl = ctrl,
  metric = "Kappa",
  trace = FALSE
)
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
## Warning in nnet::multinom(.outcome ~ ., data = dat, decay = param$decay, :
## group 'G' is empty
model_multinom
## Penalized Multinomial Regression 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   decay  Accuracy   Kappa    
##   0e+00  0.9277302  0.9045184
##   1e-04  0.9634603  0.9516678
##   1e-01  0.9367540  0.9160617
## 
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was decay = 1e-04.
pred_lda <- predict(model_lda, newdata = test_ready)
pred_multinom <- predict(model_multinom, newdata = test_ready)

confusionMatrix(pred_lda, test_ready$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 6 0 0 0 0 0 0
##          B 0 5 0 0 0 0 0
##          C 0 0 1 0 0 0 0
##          D 0 0 0 1 0 0 0
##          E 1 0 0 0 2 0 0
##          F 0 0 0 0 0 2 0
##          G 0 0 0 0 0 0 1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9474          
##                  95% CI : (0.7397, 0.9987)
##     No Information Rate : 0.3684          
##     P-Value [Acc > NIR] : 1.934e-07       
##                                           
##                   Kappa : 0.9324          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            0.8571   1.0000  1.00000  1.00000   1.0000   1.0000
## Specificity            1.0000   1.0000  1.00000  1.00000   0.9412   1.0000
## Pos Pred Value         1.0000   1.0000  1.00000  1.00000   0.6667   1.0000
## Neg Pred Value         0.9231   1.0000  1.00000  1.00000   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3158   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Prevalence   0.3158   0.2632  0.05263  0.05263   0.1579   0.1053
## Balanced Accuracy      0.9286   1.0000  1.00000  1.00000   0.9706   1.0000
##                      Class: G
## Sensitivity           1.00000
## Specificity           1.00000
## Pos Pred Value        1.00000
## Neg Pred Value        1.00000
## Prevalence            0.05263
## Detection Rate        0.05263
## Detection Prevalence  0.05263
## Balanced Accuracy     1.00000
confusionMatrix(pred_multinom, test_ready$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 7 0 0 0 0 0 0
##          B 0 5 0 0 0 0 0
##          C 0 0 1 0 0 0 0
##          D 0 0 0 1 0 0 0
##          E 0 0 0 0 2 0 0
##          F 0 0 0 0 0 2 0
##          G 0 0 0 0 0 0 1
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8235, 1)
##     No Information Rate : 0.3684     
##     P-Value [Acc > NIR] : 5.762e-09  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Specificity            1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Prevalence   0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Balanced Accuracy      1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
##                      Class: G
## Sensitivity           1.00000
## Specificity           1.00000
## Pos Pred Value        1.00000
## Neg Pred Value        1.00000
## Prevalence            0.05263
## Detection Rate        0.05263
## Detection Prevalence  0.05263
## Balanced Accuracy     1.00000
set.seed(123)

model_pls <- train(
  oilType ~ .,
  data = train_ready,
  method = "pls",
  trControl = ctrl,
  metric = "Kappa",
  tuneLength = 10
)

model_pls
## Partial Least Squares 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  Accuracy   Kappa    
##   1      0.4987778  0.2344625
##   2      0.7480079  0.6340771
##   3      0.8893651  0.8489756
##   4      0.9018651  0.8658174
##   5      0.9110873  0.8810513
##   6      0.9135873  0.8847289
## 
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 6.
pred_pls <- predict(model_pls, newdata = test_ready)

confusionMatrix(pred_pls, test_ready$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 6 0 0 0 0 0 0
##          B 0 5 0 0 0 0 1
##          C 0 0 1 0 0 0 0
##          D 0 0 0 1 0 0 0
##          E 1 0 0 0 2 0 0
##          F 0 0 0 0 0 2 0
##          G 0 0 0 0 0 0 0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8947         
##                  95% CI : (0.6686, 0.987)
##     No Information Rate : 0.3684         
##     P-Value [Acc > NIR] : 3.089e-06      
##                                          
##                   Kappa : 0.8628         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            0.8571   1.0000  1.00000  1.00000   1.0000   1.0000
## Specificity            1.0000   0.9286  1.00000  1.00000   0.9412   1.0000
## Pos Pred Value         1.0000   0.8333  1.00000  1.00000   0.6667   1.0000
## Neg Pred Value         0.9231   1.0000  1.00000  1.00000   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3158   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Prevalence   0.3158   0.3158  0.05263  0.05263   0.1579   0.1053
## Balanced Accuracy      0.9286   0.9643  1.00000  1.00000   0.9706   1.0000
##                      Class: G
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.94737
## Prevalence            0.05263
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000
resamps <- resamples(list(
  LDA = model_lda,
  Multinom = model_multinom,
  PLS = model_pls
))

summary(resamps)
## 
## Call:
## summary.resamples(object = resamps)
## 
## Models: LDA, Multinom, PLS 
## Number of resamples: 50 
## 
## Accuracy 
##               Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## LDA      0.7142857 0.8916667 1.0000000 0.9537778       1    1    0
## Multinom 0.7777778 0.9250000 1.0000000 0.9634603       1    1    0
## PLS      0.6666667 0.8571429 0.9444444 0.9135873       1    1    0
## 
## Kappa 
##               Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## LDA      0.6410256 0.8649038 1.0000000 0.9379884       1    1    0
## Multinom 0.7142857 0.9074074 1.0000000 0.9516678       1    1    0
## PLS      0.5970149 0.8013889 0.9274194 0.8847289       1    1    0
model_compare <- data.frame(
  Model = c("LDA", "Multinom", "PLS"),
  Best_Kappa = c(
    max(model_lda$results$Kappa),
    max(model_multinom$results$Kappa),
    max(model_pls$results$Kappa)
  ),
  Best_Accuracy = c(
    max(model_lda$results$Accuracy),
    max(model_multinom$results$Accuracy),
    max(model_pls$results$Accuracy)
  )
)

model_compare
##      Model Best_Kappa Best_Accuracy
## 1      LDA  0.9379884     0.9537778
## 2 Multinom  0.9516678     0.9634603
## 3      PLS  0.8847289     0.9135873
model_multinom$bestTune
##   decay
## 2 1e-04
model_pls$bestTune
##   ncomp
## 6     6
model_summary <- data.frame(
  Model = c("LDA", "Multinomial Logistic Regression", "PLS"),
  Tuning_Parameter = c("None", "decay = 1e-04", "ncomp = 6"),
  Best_Kappa = c(0.9379884, 0.9516678, 0.8847289),
  Best_Accuracy = c(0.9537778, 0.9634603, 0.9135873)
)

model_summary
##                             Model Tuning_Parameter Best_Kappa Best_Accuracy
## 1                             LDA             None  0.9379884     0.9537778
## 2 Multinomial Logistic Regression    decay = 1e-04  0.9516678     0.9634603
## 3                             PLS        ncomp = 6  0.8847289     0.9135873

1(c) For this problem, I split the data into training and testing sets using a stratified approach so that every oil type was represented in the test set. Because the classes were imbalanced, I used Kappa as the main performance metric instead of overall accuracy. I then pre-processed the predictor variables by centering and scaling them based on the training set and applied the same transformation to the test set. For model tuning, I used repeated 10-fold cross-validation with 5 repeats on the training set.

The linear classification models under consideration were Linear Discriminant Analysis (LDA), multinomial logistic regression, and Partial Least Squares (PLS). LDA has no main tuning parameter in this setup. For multinomial logistic regression, the tuning parameter was the decay value, and the best value was 1e-04. For PLS, the tuning parameter was the number of components, and the best value was 6.

1(d) Among the linear models that I considered, multinomial logistic regression performed the best because it achieved the largest cross-validated Kappa value (0.9516678), which was higher than LDA (0.9379884) and PLS (0.8847289). On the test set, the multinomial logistic regression model correctly classified all observations, giving an accuracy of 1.00 and a Kappa of 1.00.

Based on the test-set results for the best model, all seven oil types were classified correctly, so no single oil type was least accurate in this split. In contrast, PLS showed weaker performance, and oil type G was the least accurately predicted class for that model.

Problem 2 2(a)

set.seed(123)

model_knn <- train(
  oilType ~ .,
  data = train_ready,
  method = "knn",
  trControl = ctrl,
  metric = "Kappa",
  tuneLength = 10
)

model_knn
## k-Nearest Neighbors 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9367222  0.9146828
##    7  0.9331111  0.9084968
##    9  0.9334683  0.9101174
##   11  0.9091667  0.8768951
##   13  0.8993651  0.8638341
##   15  0.8403810  0.7797111
##   17  0.7530635  0.6404376
##   19  0.7396508  0.6185438
##   21  0.7144524  0.5777985
##   23  0.6793175  0.5272940
## 
## Kappa was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
set.seed(123)

model_svm_radial <- train(
  oilType ~ .,
  data = train_ready,
  method = "svmRadial",
  trControl = ctrl,
  metric = "Kappa",
  tuneLength = 10
)

model_svm_radial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy   Kappa    
##     0.25  0.7991349  0.7063598
##     0.50  0.9090873  0.8732239
##     1.00  0.9090873  0.8732239
##     2.00  0.9174206  0.8851848
##     4.00  0.9310714  0.9039786
##     8.00  0.9285714  0.9005743
##    16.00  0.9285714  0.9005743
##    32.00  0.9285714  0.9005743
##    64.00  0.9285714  0.9005743
##   128.00  0.9285714  0.9005743
## 
## Tuning parameter 'sigma' was held constant at a value of 0.4865078
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.4865078 and C = 4.
set.seed(123)

model_nnet <- train(
  oilType ~ .,
  data = train_ready,
  method = "nnet",
  trControl = ctrl,
  metric = "Kappa",
  tuneLength = 10,
  trace = FALSE,
  MaxNWts = 5000
)

model_nnet
## Neural Network 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   size  decay         Accuracy   Kappa    
##    1    0.0000000000  0.7058810  0.5822759
##    1    0.0001000000  0.7628492  0.6675961
##    1    0.0002371374  0.7780952  0.6883113
##    1    0.0005623413  0.8097381  0.7366117
##    1    0.0013335214  0.8281270  0.7635036
##    1    0.0031622777  0.8092937  0.7368155
##    1    0.0074989421  0.7869365  0.7037891
##    1    0.0177827941  0.7309841  0.6223201
##    1    0.0421696503  0.6568810  0.5073696
##    1    0.1000000000  0.6410079  0.4853734
##    3    0.0000000000  0.9172381  0.8919066
##    3    0.0001000000  0.9482302  0.9313838
##    3    0.0002371374  0.9205873  0.8952593
##    3    0.0005623413  0.9486587  0.9321224
##    3    0.0013335214  0.9449524  0.9275339
##    3    0.0031622777  0.9663968  0.9563841
##    3    0.0074989421  0.9589603  0.9462798
##    3    0.0177827941  0.9567381  0.9430963
##    3    0.0421696503  0.9533810  0.9385795
##    3    0.1000000000  0.9393016  0.9179874
##    5    0.0000000000  0.9387460  0.9198635
##    5    0.0001000000  0.9634603  0.9519460
##    5    0.0002371374  0.9589603  0.9463180
##    5    0.0005623413  0.9594603  0.9462341
##    5    0.0013335214  0.9710397  0.9620692
##    5    0.0031622777  0.9661825  0.9556910
##    5    0.0074989421  0.9684048  0.9580480
##    5    0.0177827941  0.9690397  0.9594635
##    5    0.0421696503  0.9741190  0.9658388
##    5    0.1000000000  0.9573889  0.9420098
##    7    0.0000000000  0.9551111  0.9399783
##    7    0.0001000000  0.9516587  0.9367601
##    7    0.0002371374  0.9713175  0.9628283
##    7    0.0005623413  0.9690397  0.9597356
##    7    0.0013335214  0.9715397  0.9628626
##    7    0.0031622777  0.9715397  0.9629563
##    7    0.0074989421  0.9690397  0.9593673
##    7    0.0177827941  0.9737619  0.9651078
##    7    0.0421696503  0.9737619  0.9652764
##    7    0.1000000000  0.9553889  0.9400566
##    9    0.0000000000  0.9606032  0.9489063
##    9    0.0001000000  0.9713175  0.9627530
##    9    0.0002371374  0.9735397  0.9652779
##    9    0.0005623413  0.9735397  0.9653027
##    9    0.0013335214  0.9710397  0.9621840
##    9    0.0031622777  0.9715397  0.9631715
##    9    0.0074989421  0.9757619  0.9685595
##    9    0.0177827941  0.9715397  0.9627267
##    9    0.0421696503  0.9737619  0.9654915
##    9    0.1000000000  0.9576111  0.9431990
##   11    0.0000000000  0.9590159  0.9462757
##   11    0.0001000000  0.9589603  0.9454764
##   11    0.0002371374  0.9710397  0.9623501
##   11    0.0005623413  0.9684603  0.9587654
##   11    0.0013335214  0.9735397  0.9653027
##   11    0.0031622777  0.9690397  0.9597935
##   11    0.0074989421  0.9757619  0.9680670
##   11    0.0177827941  0.9737619  0.9658897
##   11    0.0421696503  0.9757619  0.9684507
##   11    0.1000000000  0.9601111  0.9462382
##   13    0.0000000000  0.9710397  0.9613404
##   13    0.0001000000  0.9665397  0.9559111
##   13    0.0002371374  0.9737619  0.9660049
##   13    0.0005623413  0.9737619  0.9657741
##   13    0.0013335214  0.9712619  0.9621553
##   13    0.0031622777  0.9732619  0.9647191
##   13    0.0074989421  0.9737619  0.9655361
##   13    0.0177827941  0.9737619  0.9658897
##   13    0.0421696503  0.9757619  0.9684507
##   13    0.1000000000  0.9621111  0.9487991
##   15    0.0000000000  0.9583254  0.9451744
##   15    0.0001000000  0.9732619  0.9652495
##   15    0.0002371374  0.9715397  0.9628399
##   15    0.0005623413  0.9712619  0.9618349
##   15    0.0013335214  0.9757619  0.9684060
##   15    0.0031622777  0.9757619  0.9683050
##   15    0.0074989421  0.9737619  0.9658897
##   15    0.0177827941  0.9737619  0.9658897
##   15    0.0421696503  0.9737619  0.9658897
##   15    0.1000000000  0.9621111  0.9504881
##   17    0.0000000000  0.9713175  0.9627370
##   17    0.0001000000  0.9710397  0.9616710
##   17    0.0002371374  0.9757619  0.9685148
##   17    0.0005623413  0.9757619  0.9681914
##   17    0.0013335214  0.9737619  0.9659840
##   17    0.0031622777  0.9737619  0.9658450
##   17    0.0074989421  0.9757619  0.9684507
##   17    0.0177827941  0.9737619  0.9658897
##   17    0.0421696503  0.9757619  0.9684507
##   17    0.1000000000  0.9621111  0.9487448
##   19    0.0000000000  0.9660397  0.9540151
##   19    0.0001000000  0.9729048  0.9643050
##   19    0.0002371374  0.9757619  0.9685595
##   19    0.0005623413  0.9757619  0.9685450
##   19    0.0013335214  0.9709048  0.9617516
##   19    0.0031622777  0.9737619  0.9658897
##   19    0.0074989421  0.9757619  0.9684060
##   19    0.0177827941  0.9757619  0.9680971
##   19    0.0421696503  0.9757619  0.9684507
##   19    0.1000000000  0.9621111  0.9501346
## 
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were size = 9 and decay = 0.007498942.
pred_nnet <- predict(model_nnet, newdata = test_ready)

confusionMatrix(pred_nnet, test_ready$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 7 0 0 0 0 0 0
##          B 0 5 0 0 0 0 0
##          C 0 0 1 0 0 0 0
##          D 0 0 0 1 0 0 0
##          E 0 0 0 0 2 0 0
##          F 0 0 0 0 0 2 0
##          G 0 0 0 0 0 0 1
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.8235, 1)
##     No Information Rate : 0.3684     
##     P-Value [Acc > NIR] : 5.762e-09  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Specificity            1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Prevalence   0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Balanced Accuracy      1.0000   1.0000  1.00000  1.00000   1.0000   1.0000
##                      Class: G
## Sensitivity           1.00000
## Specificity           1.00000
## Pos Pred Value        1.00000
## Neg Pred Value        1.00000
## Prevalence            0.05263
## Detection Rate        0.05263
## Detection Prevalence  0.05263
## Balanced Accuracy     1.00000
resamps_nl <- resamples(list(
  kNN = model_knn,
  SVM_Radial = model_svm_radial,
  NNet = model_nnet
))

summary(resamps_nl)
## 
## Call:
## summary.resamples(object = resamps_nl)
## 
## Models: kNN, SVM_Radial, NNet 
## Number of resamples: 50 
## 
## Accuracy 
##                 Min. 1st Qu. Median      Mean 3rd Qu. Max. NA's
## kNN        0.6666667   0.875      1 0.9367222       1    1    0
## SVM_Radial 0.7000000   0.875      1 0.9310714       1    1    0
## NNet       0.7777778   1.000      1 0.9757619       1    1    0
## 
## Kappa 
##                 Min.   1st Qu. Median      Mean 3rd Qu. Max. NA's
## kNN        0.5645161 0.8341837      1 0.9146828       1    1    0
## SVM_Radial 0.6052632 0.8260870      1 0.9039786       1    1    0
## NNet       0.7272727 1.0000000      1 0.9685595       1    1    0

2(a) For Problem 2, I used the same training/testing split, the same centering and scaling steps, and the same performance metric (Kappa) as in Problem 1. I then fit several nonlinear classification models from Chapter 13, including k-nearest neighbors, support vector machines with a radial basis kernel, and neural networks.

Among these nonlinear models, the neural network had the best predictive ability. Its best cross-validated Kappa was 0.9685595, which was higher than kNN (0.9146828) and SVM with radial kernel (0.9039786). The best neural network model used size = 9 and decay = 0.007498942. This nonlinear model also performed better than the best linear model from Problem 1, which was multinomial logistic regression with a cross-validated Kappa of 0.9516678.

2(b) Based on this comparison, I would infer that the data likely have at least some nonlinear separation boundaries, because the best nonlinear model (the neural network) performed slightly better than the best linear model (multinomial logistic regression). However, the difference was not extremely large, and the linear model already performed very well. So the results suggest that some nonlinear structure may be present, but the classes are also fairly well separated by linear methods.

2(c) For the optimal nonlinear model, which was the neural network, all oil types in the test set were classified correctly. Therefore, no single oil type was most accurate or least accurate in this particular test split, since all classes achieved perfect classification.

Problem 3

library(modeldata)
## 
## Attaching package: 'modeldata'
## The following object is masked from 'package:datasets':
## 
##     penguins
data(mlc_churn)

str(mlc_churn)
## tibble [5,000 × 20] (S3: tbl_df/tbl/data.frame)
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int [1:5000] 128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int [1:5000] 25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num [1:5000] 265 162 243 299 167 ...
##  $ total_day_calls              : int [1:5000] 110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num [1:5000] 45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num [1:5000] 197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int [1:5000] 99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num [1:5000] 16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num [1:5000] 245 254 163 197 187 ...
##  $ total_night_calls            : int [1:5000] 91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num [1:5000] 11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num [1:5000] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int [1:5000] 3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num [1:5000] 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int [1:5000] 1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
dim(mlc_churn)
## [1] 5000   20
table(mlc_churn$churn)
## 
##  yes   no 
##  707 4293
prop.table(table(mlc_churn$churn))
## 
##    yes     no 
## 0.1414 0.8586

3(a)

colSums(is.na(mlc_churn))
##                         state                account_length 
##                             0                             0 
##                     area_code            international_plan 
##                             0                             0 
##               voice_mail_plan         number_vmail_messages 
##                             0                             0 
##             total_day_minutes               total_day_calls 
##                             0                             0 
##              total_day_charge             total_eve_minutes 
##                             0                             0 
##               total_eve_calls              total_eve_charge 
##                             0                             0 
##           total_night_minutes             total_night_calls 
##                             0                             0 
##            total_night_charge            total_intl_minutes 
##                             0                             0 
##              total_intl_calls             total_intl_charge 
##                             0                             0 
## number_customer_service_calls                         churn 
##                             0                             0
sapply(mlc_churn, class)
##                         state                account_length 
##                      "factor"                     "integer" 
##                     area_code            international_plan 
##                      "factor"                      "factor" 
##               voice_mail_plan         number_vmail_messages 
##                      "factor"                     "integer" 
##             total_day_minutes               total_day_calls 
##                     "numeric"                     "integer" 
##              total_day_charge             total_eve_minutes 
##                     "numeric"                     "numeric" 
##               total_eve_calls              total_eve_charge 
##                     "integer"                     "numeric" 
##           total_night_minutes             total_night_calls 
##                     "numeric"                     "integer" 
##            total_night_charge            total_intl_minutes 
##                     "numeric"                     "numeric" 
##              total_intl_calls             total_intl_charge 
##                     "integer"                     "numeric" 
## number_customer_service_calls                         churn 
##                     "integer"                      "factor"
num_vars <- mlc_churn[, sapply(mlc_churn, is.numeric)]

cor_mat <- cor(num_vars)

round(cor_mat, 2)
##                               account_length number_vmail_messages
## account_length                          1.00                 -0.01
## number_vmail_messages                  -0.01                  1.00
## total_day_minutes                       0.00                  0.01
## total_day_calls                         0.03                  0.00
## total_day_charge                        0.00                  0.01
## total_eve_minutes                      -0.01                  0.02
## total_eve_calls                         0.01                  0.00
## total_eve_charge                       -0.01                  0.02
## total_night_minutes                     0.00                  0.01
## total_night_calls                      -0.01                  0.00
## total_night_charge                      0.00                  0.01
## total_intl_minutes                      0.00                  0.00
## total_intl_calls                        0.01                  0.00
## total_intl_charge                       0.00                  0.00
## number_customer_service_calls           0.00                 -0.01
##                               total_day_minutes total_day_calls
## account_length                             0.00            0.03
## number_vmail_messages                      0.01            0.00
## total_day_minutes                          1.00            0.00
## total_day_calls                            0.00            1.00
## total_day_charge                           1.00            0.00
## total_eve_minutes                         -0.01            0.00
## total_eve_calls                            0.01            0.00
## total_eve_charge                          -0.01            0.00
## total_night_minutes                        0.01            0.00
## total_night_calls                          0.00           -0.01
## total_night_charge                         0.01            0.00
## total_intl_minutes                        -0.02            0.01
## total_intl_calls                           0.00            0.01
## total_intl_charge                         -0.02            0.01
## number_customer_service_calls              0.00           -0.01
##                               total_day_charge total_eve_minutes
## account_length                            0.00             -0.01
## number_vmail_messages                     0.01              0.02
## total_day_minutes                         1.00             -0.01
## total_day_calls                           0.00              0.00
## total_day_charge                          1.00             -0.01
## total_eve_minutes                        -0.01              1.00
## total_eve_calls                           0.01              0.00
## total_eve_charge                         -0.01              1.00
## total_night_minutes                       0.01             -0.02
## total_night_calls                         0.00              0.01
## total_night_charge                        0.01             -0.02
## total_intl_minutes                       -0.02              0.00
## total_intl_calls                          0.00              0.01
## total_intl_charge                        -0.02              0.00
## number_customer_service_calls             0.00             -0.01
##                               total_eve_calls total_eve_charge
## account_length                           0.01            -0.01
## number_vmail_messages                    0.00             0.02
## total_day_minutes                        0.01            -0.01
## total_day_calls                          0.00             0.00
## total_day_charge                         0.01            -0.01
## total_eve_minutes                        0.00             1.00
## total_eve_calls                          1.00             0.00
## total_eve_charge                         0.00             1.00
## total_night_minutes                      0.00            -0.02
## total_night_calls                       -0.01             0.01
## total_night_charge                       0.00            -0.02
## total_intl_minutes                      -0.01             0.00
## total_intl_calls                         0.01             0.01
## total_intl_charge                       -0.01             0.00
## number_customer_service_calls            0.01            -0.01
##                               total_night_minutes total_night_calls
## account_length                               0.00             -0.01
## number_vmail_messages                        0.01              0.00
## total_day_minutes                            0.01              0.00
## total_day_calls                              0.00             -0.01
## total_day_charge                             0.01              0.00
## total_eve_minutes                           -0.02              0.01
## total_eve_calls                              0.00             -0.01
## total_eve_charge                            -0.02              0.01
## total_night_minutes                          1.00              0.03
## total_night_calls                            0.03              1.00
## total_night_charge                           1.00              0.03
## total_intl_minutes                          -0.01              0.00
## total_intl_calls                            -0.02              0.00
## total_intl_charge                           -0.01              0.00
## number_customer_service_calls               -0.01             -0.01
##                               total_night_charge total_intl_minutes
## account_length                              0.00               0.00
## number_vmail_messages                       0.01               0.00
## total_day_minutes                           0.01              -0.02
## total_day_calls                             0.00               0.01
## total_day_charge                            0.01              -0.02
## total_eve_minutes                          -0.02               0.00
## total_eve_calls                             0.00              -0.01
## total_eve_charge                           -0.02               0.00
## total_night_minutes                         1.00              -0.01
## total_night_calls                           0.03               0.00
## total_night_charge                          1.00              -0.01
## total_intl_minutes                         -0.01               1.00
## total_intl_calls                           -0.02               0.02
## total_intl_charge                          -0.01               1.00
## number_customer_service_calls              -0.01              -0.01
##                               total_intl_calls total_intl_charge
## account_length                            0.01              0.00
## number_vmail_messages                     0.00              0.00
## total_day_minutes                         0.00             -0.02
## total_day_calls                           0.01              0.01
## total_day_charge                          0.00             -0.02
## total_eve_minutes                         0.01              0.00
## total_eve_calls                           0.01             -0.01
## total_eve_charge                          0.01              0.00
## total_night_minutes                      -0.02             -0.01
## total_night_calls                         0.00              0.00
## total_night_charge                       -0.02             -0.01
## total_intl_minutes                        0.02              1.00
## total_intl_calls                          1.00              0.02
## total_intl_charge                         0.02              1.00
## number_customer_service_calls            -0.02             -0.01
##                               number_customer_service_calls
## account_length                                         0.00
## number_vmail_messages                                 -0.01
## total_day_minutes                                      0.00
## total_day_calls                                       -0.01
## total_day_charge                                       0.00
## total_eve_minutes                                     -0.01
## total_eve_calls                                        0.01
## total_eve_charge                                      -0.01
## total_night_minutes                                   -0.01
## total_night_calls                                     -0.01
## total_night_charge                                    -0.01
## total_intl_minutes                                    -0.01
## total_intl_calls                                      -0.02
## total_intl_charge                                     -0.01
## number_customer_service_calls                          1.00
findCorrelation(cor_mat, cutoff = 0.90, names = TRUE)
## [1] "total_night_minutes" "total_eve_charge"    "total_day_charge"   
## [4] "total_intl_charge"
library(ggplot2)

ggplot(mlc_churn, aes(x = churn, y = total_day_minutes)) +
  geom_boxplot()

ggplot(mlc_churn, aes(x = churn, y = number_customer_service_calls)) +
  geom_boxplot()

ggplot(mlc_churn, aes(x = international_plan, fill = churn)) +
  geom_bar(position = "fill")

ggplot(mlc_churn, aes(x = voice_mail_plan, fill = churn)) +
  geom_bar(position = "fill")

3(b) I explored the churn data by examining class balance, predictor types, correlations among numeric predictors, and several visual relationships between predictors and the outcome. The response variable was imbalanced, with far more “no” cases than “yes” cases. There were no missing values in the data, and the predictors included both numeric and categorical variables.

The exploratory plots suggested that several predictors were related to churn. Customers who churned tended to have higher total day minutes and more customer service calls. The proportion of churn was also higher among customers with an international plan, while customers with a voice mail plan appeared less likely to churn.

There were also strong between-predictor correlations. In particular, total day minutes and total day charge, total evening minutes and total evening charge, total night minutes and total night charge, and total international minutes and total international charge were essentially carrying the same information. Because of this redundancy, it was reasonable to remove the charge variables before modeling. These findings also suggest that nonlinear or tree-based methods may work well, since the relationship between churn and the predictors likely depends on combinations of variables such as usage patterns, service calls, and plan types.

3(c) I split the churn data into training and testing sets using stratified sampling based on the response variable so that the class proportions of “yes” and “no” were preserved in both sets. I used 80% of the data for training and 20% for testing.

For pre-processing, I first examined the numeric predictors for redundancy. The exploratory analysis showed that the charge variables were almost perfectly correlated with the corresponding minute variables, so I removed the charge variables before modeling. Specifically, I removed total_day_charge, total_eve_charge, total_night_charge, and total_intl_charge. I also removed number_vmail_messages because it behaved like a near-zero-variance predictor in the training data. After these steps, I used the reduced training and testing data sets for model fitting and evaluation.

set.seed(123)

churn_split <- createDataPartition(mlc_churn$churn, p = 0.80, list = FALSE)

churn_train <- mlc_churn[churn_split, ]
churn_test  <- mlc_churn[-churn_split, ]

table(churn_train$churn)
## 
##  yes   no 
##  566 3435
prop.table(table(churn_train$churn))
## 
##       yes        no 
## 0.1414646 0.8585354
table(churn_test$churn)
## 
## yes  no 
## 141 858
prop.table(table(churn_test$churn))
## 
##       yes        no 
## 0.1411411 0.8588589
churn_train_red <- churn_train[, !(names(churn_train) %in% c(
  "total_day_charge",
  "total_eve_charge",
  "total_night_charge",
  "total_intl_charge"
))]

churn_test_red <- churn_test[, !(names(churn_test) %in% c(
  "total_day_charge",
  "total_eve_charge",
  "total_night_charge",
  "total_intl_charge"
))]

dim(churn_train_red)
## [1] 4001   16
dim(churn_test_red)
## [1] 999  16
names(churn_train_red)
##  [1] "state"                         "account_length"               
##  [3] "area_code"                     "international_plan"           
##  [5] "voice_mail_plan"               "number_vmail_messages"        
##  [7] "total_day_minutes"             "total_day_calls"              
##  [9] "total_eve_minutes"             "total_eve_calls"              
## [11] "total_night_minutes"           "total_night_calls"            
## [13] "total_intl_minutes"            "total_intl_calls"             
## [15] "number_customer_service_calls" "churn"
nearZeroVar(churn_train_red, saveMetrics = TRUE)
##                               freqRatio percentUnique zeroVar   nzv
## state                          1.281553    1.27468133   FALSE FALSE
## account_length                 1.020408    5.44863784   FALSE FALSE
## area_code                      2.009980    0.07498125   FALSE FALSE
## international_plan             9.755376    0.04998750   FALSE FALSE
## voice_mail_plan                2.839731    0.04998750   FALSE FALSE
## number_vmail_messages         46.984127    1.19970007   FALSE  TRUE
## total_day_minutes              1.000000   44.56385904   FALSE FALSE
## total_day_calls                1.088889    3.02424394   FALSE FALSE
## total_eve_minutes              1.111111   43.11422144   FALSE FALSE
## total_eve_calls                1.043956    3.04923769   FALSE FALSE
## total_night_minutes            1.111111   42.81429643   FALSE FALSE
## total_night_calls              1.079545    3.19920020   FALSE FALSE
## total_intl_minutes             1.014286    4.12396901   FALSE FALSE
## total_intl_calls               1.065160    0.52486878   FALSE FALSE
## number_customer_service_calls  1.592675    0.22494376   FALSE FALSE
## churn                          6.068905    0.04998750   FALSE FALSE
ctrl_churn <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

ctrl_churn
## $method
## [1] "repeatedcv"
## 
## $number
## [1] 10
## 
## $repeats
## [1] 3
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
## 
## $horizon
## [1] 1
## 
## $fixedWindow
## [1] TRUE
## 
## $skip
## [1] 0
## 
## $verboseIter
## [1] FALSE
## 
## $returnData
## [1] TRUE
## 
## $returnResamp
## [1] "final"
## 
## $savePredictions
## [1] FALSE
## 
## $classProbs
## [1] TRUE
## 
## $summaryFunction
## function (data, lev = NULL, model = NULL) 
## {
##     if (length(lev) > 2) {
##         stop(paste("Your outcome has", length(lev), "levels. The twoClassSummary() function isn't appropriate."))
##     }
##     requireNamespaceQuietStop("pROC")
##     if (!all(levels(data[, "pred"]) == lev)) {
##         stop("levels of observed and predicted data do not match")
##     }
##     rocObject <- try(pROC::roc(data$obs, data[, lev[1]], direction = ">", 
##         quiet = TRUE), silent = TRUE)
##     rocAUC <- if (inherits(rocObject, "try-error")) 
##         NA
##     else rocObject$auc
##     out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"], 
##         lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
##     names(out) <- c("ROC", "Sens", "Spec")
##     out
## }
## <bytecode: 0x149687440>
## <environment: namespace:caret>
## 
## $selectionFunction
## [1] "best"
## 
## $preProcOptions
## $preProcOptions$thresh
## [1] 0.95
## 
## $preProcOptions$ICAcomp
## [1] 3
## 
## $preProcOptions$k
## [1] 5
## 
## $preProcOptions$freqCut
## [1] 19
## 
## $preProcOptions$uniqueCut
## [1] 10
## 
## $preProcOptions$cutoff
## [1] 0.9
## 
## 
## $sampling
## NULL
## 
## $index
## NULL
## 
## $indexOut
## NULL
## 
## $indexFinal
## NULL
## 
## $timingSamps
## [1] 0
## 
## $predictionBounds
## [1] FALSE FALSE
## 
## $seeds
## [1] NA
## 
## $adaptive
## $adaptive$min
## [1] 5
## 
## $adaptive$alpha
## [1] 0.05
## 
## $adaptive$method
## [1] "gls"
## 
## $adaptive$complete
## [1] TRUE
## 
## 
## $trim
## [1] FALSE
## 
## $allowParallel
## [1] TRUE
churn_train_final <- subset(churn_train_red, select = -number_vmail_messages)
churn_test_final  <- subset(churn_test_red,  select = -number_vmail_messages)

set.seed(123)

model_glm_churn <- train(
  churn ~ .,
  data = churn_train_final,
  method = "glm",
  family = binomial(),
  trControl = ctrl_churn,
  metric = "ROC"
)

model_glm_churn
## Generalized Linear Model 
## 
## 4001 samples
##   14 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 3600, 3601, 3602, 3602, 3600, 3601, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8181677  0.2498225  0.9702093
set.seed(123)

model_rpart_churn <- train(
  churn ~ .,
  data = churn_train_final,
  method = "rpart",
  trControl = ctrl_churn,
  metric = "ROC",
  tuneLength = 10
)

model_rpart_churn
## CART 
## 
## 4001 samples
##   14 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 3600, 3601, 3602, 3602, 3600, 3601, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.008833922  0.8905228  0.6937865  0.9800094
##   0.010600707  0.8912441  0.6937761  0.9791362
##   0.012367491  0.8869451  0.6766500  0.9799128
##   0.014487633  0.8703704  0.6407268  0.9809801
##   0.015901060  0.8535762  0.6041876  0.9841831
##   0.022084806  0.8402261  0.5818505  0.9878681
##   0.051236749  0.7722043  0.4858292  0.9804947
##   0.066254417  0.6723040  0.3257832  0.9799091
##   0.081272085  0.6212854  0.2709482  0.9737002
##   0.106007067  0.5727409  0.1673141  0.9781677
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01060071.
pred_rpart_class <- predict(model_rpart_churn, newdata = churn_test_final)
confusionMatrix(pred_rpart_class, churn_test_final$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes 107  16
##        no   34 842
##                                           
##                Accuracy : 0.9499          
##                  95% CI : (0.9345, 0.9626)
##     No Information Rate : 0.8589          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.7819          
##                                           
##  Mcnemar's Test P-Value : 0.01621         
##                                           
##             Sensitivity : 0.7589          
##             Specificity : 0.9814          
##          Pos Pred Value : 0.8699          
##          Neg Pred Value : 0.9612          
##              Prevalence : 0.1411          
##          Detection Rate : 0.1071          
##    Detection Prevalence : 0.1231          
##       Balanced Accuracy : 0.8701          
##                                           
##        'Positive' Class : yes             
## 
pred_rpart_prob <- predict(model_rpart_churn, newdata = churn_test_final, type = "prob")
twoClassSummary(data.frame(
  obs = churn_test_final$churn,
  pred = pred_rpart_class,
  yes = pred_rpart_prob$yes,
  no = pred_rpart_prob$no
))
##  ROC Sens Spec 
##   NA   NA   NA
ctrl_churn_fast <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

set.seed(123)

model_c50_churn <- train(
  churn ~ .,
  data = churn_train_final,
  method = "C5.0",
  trControl = ctrl_churn_fast,
  metric = "ROC",
  tuneGrid = expand.grid(
    model = "tree",
    trials = c(1, 10),
    winnow = FALSE
  )
)

model_c50_churn
## C5.0 
## 
## 4001 samples
##   14 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 3201, 3201, 3200, 3201, 3201 
## Resampling results across tuning parameters:
## 
##   trials  ROC        Sens       Spec     
##    1      0.9000241  0.6873156  0.9819505
##   10      0.9065020  0.7208974  0.9857351
## 
## Tuning parameter 'model' was held constant at a value of tree
## Tuning
##  parameter 'winnow' was held constant at a value of FALSE
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were trials = 10, model = tree and winnow
##  = FALSE.
pred_c50_class <- predict(model_c50_churn, newdata = churn_test_final)
confusionMatrix(pred_c50_class, churn_test_final$churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction yes  no
##        yes 118   4
##        no   23 854
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.9609, 0.9821)
##     No Information Rate : 0.8589          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8819          
##                                           
##  Mcnemar's Test P-Value : 0.000532        
##                                           
##             Sensitivity : 0.8369          
##             Specificity : 0.9953          
##          Pos Pred Value : 0.9672          
##          Neg Pred Value : 0.9738          
##              Prevalence : 0.1411          
##          Detection Rate : 0.1181          
##    Detection Prevalence : 0.1221          
##       Balanced Accuracy : 0.9161          
##                                           
##        'Positive' Class : yes             
## 
problem3_compare <- data.frame(
  Model = c("Logistic Regression", "CART (rpart)", "C5.0"),
  CV_ROC = c(
    0.8181677,
    0.8912441,
    0.9065020
  ),
  Test_Accuracy = c(
    NA,
    0.9499,
    0.9730
  ),
  Test_Kappa = c(
    NA,
    0.7819,
    0.8819
  )
)

problem3_compare
##                 Model    CV_ROC Test_Accuracy Test_Kappa
## 1 Logistic Regression 0.8181677            NA         NA
## 2        CART (rpart) 0.8912441        0.9499     0.7819
## 3                C5.0 0.9065020        0.9730     0.8819

3(d) I compared several classification models for the churn data, including logistic regression, CART, and C5.0. Logistic regression served as a baseline model and achieved a cross-validated ROC of 0.8181677. CART improved on this result, with a best cross-validated ROC of 0.8912441. On the test set, CART achieved an accuracy of 0.9499 and a Kappa of 0.7819.

Among the models I tried, C5.0 had the best predictive performance. Its best cross-validated ROC was 0.9065020, which was higher than both logistic regression and CART. On the test set, C5.0 also performed best, with an accuracy of 0.9730, a Kappa of 0.8819, a sensitivity of 0.8369, and a specificity of 0.9953. Therefore, yes, other models did have better predictive performance, and C5.0 was the strongest model among the ones I tested.

Problem 4

library(caret)
data(oil)

set.seed(123)

oil_dat <- data.frame(oilType = oilType, fattyAcids)

test_idx <- unlist(lapply(split(seq_len(nrow(oil_dat)), oil_dat$oilType), function(idx) {
  sample(idx, size = max(1, floor(0.20 * length(idx))))
}))

train_dat <- oil_dat[-test_idx, ]
test_dat  <- oil_dat[test_idx, ]

table(train_dat$oilType)
## 
##  A  B  C  D  E  F  G 
## 30 21  2  6  9  8  1
table(test_dat$oilType)
## 
## A B C D E F G 
## 7 5 1 1 2 2 1

4(a)

set.seed(123)

model_tree <- train(
  oilType ~ .,
  data = train_dat,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)

model_tree
## CART 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   cp         Accuracy   Kappa    
##   0.0000000  0.8499603  0.8016579
##   0.1702128  0.7154365  0.5885568
##   0.4468085  0.4922619  0.1591270
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
pred_tree <- predict(model_tree, newdata = test_dat)
confusionMatrix(pred_tree, test_dat$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 7 0 0 0 0 0 0
##          B 0 5 0 0 0 0 0
##          C 0 0 0 0 0 0 0
##          D 0 0 0 0 0 0 0
##          E 0 0 0 0 2 0 1
##          F 0 0 1 1 0 2 0
##          G 0 0 0 0 0 0 0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8421          
##                  95% CI : (0.6042, 0.9662)
##     No Information Rate : 0.3684          
##     P-Value [Acc > NIR] : 3.122e-05       
##                                           
##                   Kappa : 0.7912          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            1.0000   1.0000  0.00000  0.00000   1.0000   1.0000
## Specificity            1.0000   1.0000  1.00000  1.00000   0.9412   0.8824
## Pos Pred Value         1.0000   1.0000      NaN      NaN   0.6667   0.5000
## Neg Pred Value         1.0000   1.0000  0.94737  0.94737   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3684   0.2632  0.00000  0.00000   0.1053   0.1053
## Detection Prevalence   0.3684   0.2632  0.00000  0.00000   0.1579   0.2105
## Balanced Accuracy      1.0000   1.0000  0.50000  0.50000   0.9706   0.9412
##                      Class: G
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.94737
## Prevalence            0.05263
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

4(b)

set.seed(123)

model_bag <- train(
  oilType ~ .,
  data = train_dat,
  method = "treebag",
  trControl = trainControl(method = "cv", number = 10)
)

model_bag
## Bagged CART 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9260317  0.9034891
pred_bag <- predict(model_bag, newdata = test_dat)
confusionMatrix(pred_bag, test_dat$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 7 0 0 0 0 0 0
##          B 0 5 0 0 0 0 0
##          C 0 0 0 0 0 0 0
##          D 0 0 0 1 0 0 0
##          E 0 0 0 0 2 0 1
##          F 0 0 0 0 0 2 0
##          G 0 0 1 0 0 0 0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8947         
##                  95% CI : (0.6686, 0.987)
##     No Information Rate : 0.3684         
##     P-Value [Acc > NIR] : 3.089e-06      
##                                          
##                   Kappa : 0.8618         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            1.0000   1.0000  0.00000  1.00000   1.0000   1.0000
## Specificity            1.0000   1.0000  1.00000  1.00000   0.9412   1.0000
## Pos Pred Value         1.0000   1.0000      NaN  1.00000   0.6667   1.0000
## Neg Pred Value         1.0000   1.0000  0.94737  1.00000   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3684   0.2632  0.00000  0.05263   0.1053   0.1053
## Detection Prevalence   0.3684   0.2632  0.00000  0.05263   0.1579   0.1053
## Balanced Accuracy      1.0000   1.0000  0.50000  1.00000   0.9706   1.0000
##                      Class: G
## Sensitivity           0.00000
## Specificity           0.94444
## Pos Pred Value        0.00000
## Neg Pred Value        0.94444
## Prevalence            0.05263
## Detection Rate        0.00000
## Detection Prevalence  0.05263
## Balanced Accuracy     0.47222
set.seed(123)

model_boost <- train(
  oilType ~ .,
  data = train_dat,
  method = "C5.0",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = expand.grid(
    model = "tree",
    trials = c(1, 10, 20),
    winnow = FALSE
  )
)
## Warning: 'trials' should be <= 4 for this object. Predictions generated using 4
## trials
## Warning: 'trials' should be <= 4 for this object. Predictions generated using 4
## trials
## Warning: 'trials' should be <= 4 for this object. Predictions generated using 4
## trials
## Warning: 'trials' should be <= 4 for this object. Predictions generated using 4
## trials
## Warning: 'trials' should be <= 3 for this object. Predictions generated using 3
## trials
## Warning: 'trials' should be <= 3 for this object. Predictions generated using 3
## trials
## Warning: 'trials' should be <= 9 for this object. Predictions generated using 9
## trials
## Warning: 'trials' should be <= 4 for this object. Predictions generated using 4
## trials
## Warning: 'trials' should be <= 9 for this object. Predictions generated using 9
## trials
## Warning: 'trials' should be <= 8 for this object. Predictions generated using 8
## trials
model_boost
## C5.0 
## 
## 77 samples
##  7 predictor
##  7 classes: 'A', 'B', 'C', 'D', 'E', 'F', 'G' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 69, 69, 68, 70, 67, 71, ... 
## Resampling results across tuning parameters:
## 
##   trials  Accuracy   Kappa    
##    1      0.9438889  0.9266522
##   10      0.8925794  0.8579787
##   20      0.8925794  0.8579787
## 
## Tuning parameter 'model' was held constant at a value of tree
## Tuning
##  parameter 'winnow' was held constant at a value of FALSE
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 1, model = tree and winnow
##  = FALSE.
pred_boost <- predict(model_boost, newdata = test_dat)
confusionMatrix(pred_boost, test_dat$oilType)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction A B C D E F G
##          A 6 0 0 0 0 0 1
##          B 0 5 0 0 0 0 0
##          C 0 0 1 0 0 0 0
##          D 0 0 0 1 0 0 0
##          E 1 0 0 0 2 0 0
##          F 0 0 0 0 0 2 0
##          G 0 0 0 0 0 0 0
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8947         
##                  95% CI : (0.6686, 0.987)
##     No Information Rate : 0.3684         
##     P-Value [Acc > NIR] : 3.089e-06      
##                                          
##                   Kappa : 0.8618         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E Class: F
## Sensitivity            0.8571   1.0000  1.00000  1.00000   1.0000   1.0000
## Specificity            0.9167   1.0000  1.00000  1.00000   0.9412   1.0000
## Pos Pred Value         0.8571   1.0000  1.00000  1.00000   0.6667   1.0000
## Neg Pred Value         0.9167   1.0000  1.00000  1.00000   1.0000   1.0000
## Prevalence             0.3684   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Rate         0.3158   0.2632  0.05263  0.05263   0.1053   0.1053
## Detection Prevalence   0.3684   0.2632  0.05263  0.05263   0.1579   0.1053
## Balanced Accuracy      0.8869   1.0000  1.00000  1.00000   0.9706   1.0000
##                      Class: G
## Sensitivity           0.00000
## Specificity           1.00000
## Pos Pred Value            NaN
## Neg Pred Value        0.94737
## Prevalence            0.05263
## Detection Rate        0.00000
## Detection Prevalence  0.00000
## Balanced Accuracy     0.50000

4(c) Bagging improved the performance of the basic tree. The single tree had a cross-validated accuracy of 0.8499603 and a Kappa of 0.8016579, while the bagged tree increased these values to 0.9260317 and 0.9034891. On the test set, the basic tree achieved an accuracy of 0.8421 and a Kappa of 0.7912, whereas the bagged tree improved to an accuracy of 0.8947 and a Kappa of 0.8618.

For boosting, I used C5.0 models with different numbers of trials. The best C5.0 result was obtained when trials = 1. Increasing the number of boosting trials to 10 or 20 did not improve performance; instead, the resampling accuracy and Kappa became lower. Therefore, bagging improved the tree performance, but boosting did not provide additional improvement in this analysis.

4(d) The best-performing model based on cross-validation was the C5.0 model with trials = 1, model = tree, and winnow = FALSE. This model achieved a cross-validated accuracy of 0.9438889 and a Kappa of 0.9266522. However, on the test set, the bagged tree and the selected C5.0 model had the same performance, with an accuracy of 0.8947 and a Kappa of 0.8618. Overall, the C5.0 model was the strongest model according to the resampling results, while bagging and C5.0 performed equally well on the test set.