1 Exercise 7.1: SVM for a Nonlinear Sine Wave

1.1 Simulate the data

# Simulate 100 noisy observations of sin(x) for x drawn uniformly from
# [2, 10]. The fixed seed makes the draw reproducible.
set.seed(100)
x <- runif(100, min = 2, max = 10)
y <- sin(x) + rnorm(length(x)) * 0.25
sinData <- data.frame(x = x, y = y)

# Evenly spaced grid over the same range, used for drawing fitted curves.
# `length.out` is written in full instead of relying on partial matching
# of the abbreviated `length` argument.
dataGrid <- data.frame(x = seq(2, 10, length.out = 100))

plot(x, y,
     main = "Simulated sine data",
     pch = 19,
     col = "steelblue4")

1.2 7.1(a): Fit RBF SVM models with different cost and epsilon values

# Tuning values to compare: every cost is fitted within each epsilon setting.
cost_values <- c(0.1, 1, 10, 100)
epsilon_values <- c(0.01, 0.1, 0.25)

# All (C, epsilon) combinations, kept for reference.
svm_results <- expand.grid(C = cost_values, epsilon = epsilon_values)

# One colour per cost value. It does not depend on epsilon, so it is built
# once rather than inside the panel loop.
cols <- c("#1B9E77", "#D95F02", "#7570B3", "#E7298A")

# One stacked panel per epsilon value. par() returns the previous settings,
# which are saved so that both mfrow and mar can be restored afterwards.
old_par <- par(mfrow = c(length(epsilon_values), 1), mar = c(4, 4, 3, 1))

for (eps in epsilon_values) {
  # paste0() keeps the comma attached to the epsilon value in the title.
  plot(sinData$x, sinData$y,
       main = paste0("RBF SVM fits, epsilon = ", eps, ", automatic sigma"),
       xlab = "x", ylab = "y", pch = 19, col = "steelblue4")

  # Overlay one fitted curve per cost value on the current panel.
  for (i in seq_along(cost_values)) {
    model <- ksvm(
      x = x,
      y = y,
      data = sinData,
      kernel = "rbfdot",
      kpar = "automatic",
      C = cost_values[i],
      epsilon = eps
    )

    pred <- as.numeric(predict(model, newdata = dataGrid))
    lines(dataGrid$x, pred, col = cols[i], lwd = 2)
  }

  legend("topright", legend = paste("C =", cost_values), col = cols, lwd = 2)
}

# Put the graphics parameters back exactly as they were before this chunk.
par(old_par)

1.3 7.1(b): Fit RBF SVM models with different sigma values

# Kernel parameters to compare, with one line colour for each.
sigma_values <- c(0.01, 0.05, 0.1, 0.5, 1, 5)
sigma_cols <- c("#1B9E77", "#D95F02", "#7570B3",
                "#E7298A", "#66A61E", "#E6AB02")

# Scatter plot of the simulated data; fitted curves are drawn on top of it.
plot(
  sinData$x, sinData$y,
  main = "RBF SVM fits with different sigma values",
  xlab = "x",
  ylab = "y",
  pch = 19,
  col = "steelblue4"
)

# Fit one RBF SVM per sigma value (C and epsilon fixed) and overlay its
# predictions along the grid.
for (i in seq_along(sigma_values)) {
  rbf_par <- list(sigma = sigma_values[i])

  model <- ksvm(x = x, y = y, data = sinData,
                kernel = "rbfdot", kpar = rbf_par,
                C = 1, epsilon = 0.1)

  pred <- predict(model, newdata = dataGrid)
  pred <- as.numeric(pred)
  lines(dataGrid$x, pred, col = sigma_cols[i], lwd = 2)
}

legend(
  "topright",
  legend = paste("sigma =", sigma_values),
  col = sigma_cols,
  lwd = 2,
  bty = "n"
)

1.4 Written answer

The cost parameter C controls the penalty for errors outside the epsilon-insensitive tube. Larger C values usually create more flexible fits because the model tries harder to reduce training error. Smaller C values allow more error and usually produce smoother fits.

The epsilon parameter controls how much deviation from the observed response is ignored. Larger epsilon values usually produce smoother, less sensitive models. Smaller epsilon values make the model respond more closely to individual observations.

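The sigma parameter controls the width of the radial basis function kernel. In kernlab's parameterization the kernel is k(x, x') = exp(-sigma * ||x - x'||^2), so sigma acts as an inverse width. Small sigma values give each support vector a broad influence and produce smoother fits. Large sigma values make each support vector more local, which can create very flexible or wiggly fits.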

2 Exercise 7.2: Friedman Benchmark Data

2.1 Simulate training and test data

# A single seed covers both draws, so the training set is generated first
# and the test set second from the same random stream.
set.seed(200)

# 200 training observations; the predictor matrix is converted to a data
# frame before modeling.
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData[["x"]] <- data.frame(trainingData[["x"]])

# 5000 test observations, prepared the same way.
testData <- mlbench.friedman1(5000, sd = 1)
testData[["x"]] <- data.frame(testData[["x"]])

# Plot the response against each of the first five predictors.
featurePlot(
  x = trainingData[["x"]][, seq_len(5)],
  y = trainingData[["y"]]
)

2.2 Set resampling controls

# Resampling scheme shared by the Friedman models below: 25 bootstrap
# resamples.
set.seed(200)
ctrl <- trainControl(method = "boot", number = 25)

2.3 KNN model

# K-nearest neighbours on centered and scaled predictors; tuneLength = 10
# evaluates ten candidate values of k. The seed is reset before training.
set.seed(200)
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)

# Print the resampling summary.
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.654912  0.4779838  2.958475
##    7  3.529432  0.5118581  2.861742
##    9  3.446330  0.5425096  2.780756
##   11  3.378049  0.5723793  2.719410
##   13  3.332339  0.5953773  2.692863
##   15  3.309235  0.6111389  2.663046
##   17  3.317408  0.6201421  2.678898
##   19  3.311667  0.6333800  2.682098
##   21  3.316340  0.6407537  2.688887
##   23  3.326040  0.6491480  2.705915
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
# Score the tuned KNN model on the 5000-row test set (RMSE, R-squared, MAE).
knnPred <- predict(knnModel, newdata = testData$x)
knnTest <- postResample(pred = knnPred, obs = testData$y)
knnTest
##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169

2.4 SVM radial model

# Radial-basis SVM on centered and scaled predictors. With tuneLength = 10
# only the cost C is varied; sigma is held at one value (see the printed
# summary).
set.seed(200)
svmRadialModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)

# Print the resampling summary.
svmRadialModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.635010  0.7685188  2.074977
##     0.50  2.423373  0.7839086  1.902162
##     1.00  2.284137  0.8001534  1.791779
##     2.00  2.196624  0.8126474  1.713560
##     4.00  2.143035  0.8209820  1.668024
##     8.00  2.119159  0.8246308  1.649388
##    16.00  2.117440  0.8248675  1.648570
##    32.00  2.117440  0.8248675  1.648570
##    64.00  2.117440  0.8248675  1.648570
##   128.00  2.117440  0.8248675  1.648570
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 16.
# Score the tuned radial SVM on the test set (RMSE, R-squared, MAE).
svmRadialPred <- predict(svmRadialModel, newdata = testData$x)
svmRadialTest <- postResample(pred = svmRadialPred, obs = testData$y)
svmRadialTest
##      RMSE  Rsquared       MAE 
## 2.0736997 0.8256573 1.5751967

2.5 Neural network model

# Single-hidden-layer neural network on centered and scaled predictors.
# tuneLength = 10 gives a 10 x 10 grid over size and decay (see the printed
# summary). trace and linout are passed on to the nnet fit: trace = FALSE
# silences the optimizer log, and linout = TRUE requests a linear output
# unit for the numeric response.
set.seed(200)
nnetModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trace = FALSE,
  linout = TRUE,
  trControl = ctrl
)

# Print the resampling summary.
nnetModel
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   size  decay         RMSE      Rsquared   MAE     
##    1    0.0000000000  2.857681  0.6723338  2.267740
##    1    0.0001000000  2.611837  0.7336945  2.041339
##    1    0.0002371374  2.652195  0.7248083  2.069900
##    1    0.0005623413  2.692898  0.7137728  2.103952
##    1    0.0013335214  2.779618  0.6946793  2.183050
##    1    0.0031622777  2.643114  0.7256872  2.070798
##    1    0.0074989421  2.835257  0.6793942  2.243404
##    1    0.0177827941  2.631766  0.7320816  2.054204
##    1    0.0421696503  2.641100  0.7266843  2.061686
##    1    0.1000000000  2.607462  0.7347540  2.025355
##    3    0.0000000000  2.912356  0.6859387  2.292561
##    3    0.0001000000  2.912349  0.6862719  2.315246
##    3    0.0002371374  2.993780  0.6711070  2.295366
##    3    0.0005623413  2.952006  0.6739509  2.284218
##    3    0.0013335214  2.985134  0.6763799  2.309017
##    3    0.0031622777  2.757073  0.7126561  2.168808
##    3    0.0074989421  2.957485  0.6769267  2.333248
##    3    0.0177827941  2.868200  0.6880035  2.247632
##    3    0.0421696503  2.864051  0.6919694  2.260468
##    3    0.1000000000  2.836270  0.6962245  2.231301
##    5    0.0000000000  3.507600  0.5923474  2.714945
##    5    0.0001000000  3.429018  0.6036296  2.628053
##    5    0.0002371374  3.576897  0.5858850  2.727089
##    5    0.0005623413  3.461760  0.6152685  2.595539
##    5    0.0013335214  3.484723  0.6006902  2.613098
##    5    0.0031622777  4.014473  0.5821656  2.854185
##    5    0.0074989421  3.538293  0.5894473  2.624627
##    5    0.0177827941  3.501733  0.6068143  2.615091
##    5    0.0421696503  3.349046  0.6169347  2.577578
##    5    0.1000000000  3.082589  0.6601890  2.392424
##    7    0.0000000000  4.115836  0.5567849  3.032943
##    7    0.0001000000  3.895245  0.5409011  2.995755
##    7    0.0002371374  3.943197  0.5395360  2.964994
##    7    0.0005623413  4.040973  0.5126372  2.943177
##    7    0.0013335214  3.897903  0.5469001  2.939066
##    7    0.0031622777  3.860319  0.5531301  2.891862
##    7    0.0074989421  3.964664  0.5596238  2.872539
##    7    0.0177827941  3.978467  0.5579078  2.928113
##    7    0.0421696503  3.589534  0.5939248  2.746895
##    7    0.1000000000  3.558681  0.6041327  2.750068
##    9    0.0000000000  3.612699  0.5746775  2.875637
##    9    0.0001000000  3.587186  0.5782922  2.814552
##    9    0.0002371374  3.761354  0.5666389  2.871148
##    9    0.0005623413  3.646453  0.5739501  2.851603
##    9    0.0013335214  3.646505  0.5699347  2.865126
##    9    0.0031622777  3.556649  0.5991312  2.800482
##    9    0.0074989421  3.713308  0.5707865  2.895456
##    9    0.0177827941  3.594872  0.5840222  2.801455
##    9    0.0421696503  3.636160  0.5589804  2.863139
##    9    0.1000000000  3.344857  0.6181798  2.651709
##   11    0.0000000000  3.600064  0.5686117  2.860076
##   11    0.0001000000  3.625826  0.5612020  2.886747
##   11    0.0002371374  3.592888  0.5730993  2.849931
##   11    0.0005623413  3.484042  0.5890295  2.805283
##   11    0.0013335214  3.563237  0.5692543  2.851637
##   11    0.0031622777  3.585574  0.5643464  2.891371
##   11    0.0074989421  3.485933  0.5841910  2.804457
##   11    0.0177827941  3.371604  0.6070822  2.703692
##   11    0.0421696503  3.410609  0.6099066  2.705478
##   11    0.1000000000  3.210208  0.6342631  2.559457
##   13    0.0000000000  3.635354  0.5704545  2.886886
##   13    0.0001000000  3.526441  0.5698350  2.783507
##   13    0.0002371374  3.513322  0.5826522  2.841020
##   13    0.0005623413  3.623964  0.5628042  2.898225
##   13    0.0013335214  3.595604  0.5751410  2.911485
##   13    0.0031622777  3.457168  0.5912396  2.740058
##   13    0.0074989421  3.389653  0.6100497  2.696152
##   13    0.0177827941  3.334808  0.6107531  2.697179
##   13    0.0421696503  3.274238  0.6203413  2.628778
##   13    0.1000000000  3.180047  0.6337845  2.513425
##   15    0.0000000000  3.465868  0.5856474  2.756788
##   15    0.0001000000  3.477626  0.5984187  2.789750
##   15    0.0002371374  3.681643  0.5556947  2.918260
##   15    0.0005623413  3.560213  0.5734515  2.841638
##   15    0.0013335214  3.466343  0.5832586  2.774515
##   15    0.0031622777  3.396764  0.5976196  2.718497
##   15    0.0074989421  3.413103  0.6050543  2.716613
##   15    0.0177827941  3.256615  0.6235438  2.578352
##   15    0.0421696503  3.220113  0.6311620  2.563574
##   15    0.1000000000  3.005262  0.6666618  2.383601
##   17    0.0000000000  3.476792  0.5895967  2.750491
##   17    0.0001000000  3.426940  0.5938475  2.729022
##   17    0.0002371374  3.417389  0.5933411  2.706695
##   17    0.0005623413  3.395365  0.6037099  2.729314
##   17    0.0013335214  3.458376  0.5901157  2.740308
##   17    0.0031622777  3.354502  0.6033934  2.689892
##   17    0.0074989421  3.279659  0.6092279  2.585494
##   17    0.0177827941  3.171471  0.6376058  2.518292
##   17    0.0421696503  3.019724  0.6686938  2.401347
##   17    0.1000000000  2.832803  0.6976676  2.245190
##   19    0.0000000000  3.390739  0.6089025  2.677523
##   19    0.0001000000  3.216878  0.6293649  2.545302
##   19    0.0002371374  3.304582  0.6224087  2.641450
##   19    0.0005623413  3.297378  0.6177093  2.621346
##   19    0.0013335214  3.217976  0.6350035  2.587980
##   19    0.0031622777  3.196462  0.6305028  2.537275
##   19    0.0074989421  3.120958  0.6495519  2.492043
##   19    0.0177827941  2.951825  0.6791262  2.362497
##   19    0.0421696503  2.861992  0.6971037  2.277578
##   19    0.1000000000  2.798424  0.7058756  2.216845
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
# Score the tuned neural network on the test set (RMSE, R-squared, MAE).
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetTest <- postResample(pred = nnetPred, obs = testData$y)
nnetTest
##     RMSE Rsquared      MAE 
## 2.649315 0.717721 2.029526

2.6 MARS model

# MARS via earth, with no pre-processing. With tuneLength = 10 only nprune
# is varied; degree stays at 1 (see the printed summary), so the fit is
# additive and product (interaction) terms are never considered.
# NOTE(review): a grid that also includes degree = 2 may be worth trying if
# predictor interactions are expected.
set.seed(200)
marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneLength = 10,
  trControl = ctrl
)

# Print the resampling summary.
marsModel
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      4.447045  0.2249607  3.650128
##    3      3.744821  0.4546610  3.019175
##    4      2.828643  0.6892908  2.244131
##    6      2.406670  0.7747079  1.906733
##    7      2.027113  0.8375721  1.594956
##    9      1.800794  0.8728377  1.411703
##   10      1.810047  0.8721377  1.412023
##   12      1.831608  0.8700790  1.430044
##   13      1.839717  0.8686550  1.440537
##   15      1.856211  0.8663787  1.452430
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.
# Score the tuned MARS model on the test set (RMSE, R-squared, MAE).
marsPred <- predict(marsModel, newdata = testData$x)
marsTest <- postResample(pred = marsPred, obs = testData$y)
marsTest
##      RMSE  Rsquared       MAE 
## 1.7901760 0.8705315 1.3712537

2.7 Compare test-set performance

# Stack the four test-set metric vectors into one matrix, one row per model.
test_metrics <- list(
  KNN = knnTest,
  SVM_Radial = svmRadialTest,
  Neural_Network = nnetTest,
  MARS = marsTest
)
friedmanResults <- do.call(rbind, test_metrics)

friedmanResults
##                    RMSE  Rsquared      MAE
## KNN            3.175066 0.6785946 2.544317
## SVM_Radial     2.073700 0.8256573 1.575197
## Neural_Network 2.649315 0.7177210 2.029526
## MARS           1.790176 0.8705315 1.371254
# Same table ordered by ascending test-set RMSE (best model first).
friedmanResults[order(friedmanResults[, "RMSE"]), ]
##                    RMSE  Rsquared      MAE
## MARS           1.790176 0.8705315 1.371254
## SVM_Radial     2.073700 0.8256573 1.575197
## Neural_Network 2.649315 0.7177210 2.029526
## KNN            3.175066 0.6785946 2.544317

2.8 Does MARS select the informative predictors?

# Print the earth model refit at the selected tuning values.
marsModel$finalModel
## Selected 9 of 18 terms, and 6 of 10 predictors (nprune=9)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 (additive model)
## GCV 2.759293    RSS 462.0297    GRSq 0.8879716    RSq 0.905262
# List the retained hinge terms and their coefficients.
summary(marsModel$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
##             nprune=9)
## 
##                coefficients
## (Intercept)       21.236556
## h(0.621722-X1)   -10.850227
## h(0.601063-X2)   -10.649871
## h(0.447442-X3)     9.213769
## h(X3-0.636458)    13.532667
## h(0.734892-X4)    -9.675236
## h(X4-0.734892)    10.703568
## h(0.850094-X5)    -5.274161
## h(X6-0.361791)    -1.737028
## 
## Selected 9 of 18 terms, and 6 of 10 predictors (nprune=9)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 (additive model)
## GCV 2.759293    RSS 462.0297    GRSq 0.8879716    RSq 0.905262
# Variable importance for the predictors used by the MARS model.
varImp(marsModel)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   82.92
## X2   64.47
## X5   40.67
## X3   28.65
## X6    0.00
# Plot the same importance scores.
plot(varImp(marsModel), main = "MARS variable importance", col = "#2C7BB6")

2.9 Written answer

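Use the test-set RMSE table to identify the best model: the lowest test-set RMSE indicates the best generalization performance. In the table above, MARS performs best (RMSE 1.79, R-squared 0.87), followed by the radial SVM (RMSE 2.07), the neural network (RMSE 2.65), and KNN (RMSE 3.18).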

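For the Friedman simulation, the truly informative predictors are X1 through X5; X6 through X10 are noise variables. The MARS model should ideally assign the most importance to variables among X1 through X5, and the summary(marsModel$finalModel) output and the variable-importance plot confirm that it does. The importance ranking is X1, X4, X2, X5, X3, and X7 through X10 are unused. X6 enters through a single hinge term with a small coefficient (-1.74) and receives an importance score of 0.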

3 Exercise 7.3: Tecator Data

3.1 Load and prepare Tecator data

# Load the Tecator objects used below: `absorp` (spectra) and `endpoints`
# (responses).
data(tecator)

# as.data.frame() always creates V1, V2, ... names for an unnamed matrix, so
# testing the converted data frame for missing names can never succeed. The
# check therefore looks at `absorp` itself.
tecator_x <- as.data.frame(absorp)
if (is.null(colnames(absorp))) {
  colnames(tecator_x) <- paste0("X", seq_len(ncol(tecator_x)))
}

# Label the response columns when they arrive unnamed.
# NOTE(review): assumes the three columns are ordered Water, Fat, Protein --
# confirm against the tecator help page.
tecator_endpoints <- endpoints
if (is.matrix(tecator_endpoints) || is.data.frame(tecator_endpoints)) {
  if (is.null(colnames(tecator_endpoints)) && ncol(tecator_endpoints) == 3) {
    colnames(tecator_endpoints) <- c("Water", "Fat", "Protein")
  }
}

# Fail with a clear message rather than a bare subscript error when the
# response column cannot be identified.
if (!("Fat" %in% colnames(tecator_endpoints))) {
  stop("Tecator endpoints have no 'Fat' column", call. = FALSE)
}
tecator_y <- tecator_endpoints[, "Fat"]

# 75/25 train/test split based on the fat response; the seed fixes the
# partition.
set.seed(300)
tecatorIndex <- createDataPartition(tecator_y, p = 0.75, list = FALSE)
tecatorTrainX <- tecator_x[tecatorIndex, ]
tecatorTrainY <- tecator_y[tecatorIndex]
tecatorTestX <- tecator_x[-tecatorIndex, ]
tecatorTestY <- tecator_y[-tecatorIndex]

# Resampling scheme shared by the Tecator models: 10-fold cross-validation
# repeated 3 times.
tecatorCtrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

The Tecator data shipped in this environment did not include predictor or response names, so the code assigns X1, X2, and so on to the spectra and labels the response columns as Water, Fat, and Protein before modeling. That keeps caret::train() and the response extraction stable across package versions.

3.2 SVM model

# Radial-basis SVM on centered and scaled spectra; only the cost C is
# varied, and sigma is held at one value (see the printed summary).
# NOTE(review): the selected C = 128 is the largest value tried and RMSE is
# still falling there, so a wider cost grid may improve the fit.
set.seed(300)
tecatorSVM <- train(
  x = tecatorTrainX,
  y = tecatorTrainY,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = tecatorCtrl
)

# Print the resampling summary.
tecatorSVM
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 163 samples
## 100 predictors
## 
## Pre-processing: centered (100), scaled (100) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 147, 147, 146, 147, 145, 147, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE       Rsquared   MAE     
##     0.25  10.133906  0.3926338  7.801238
##     0.50   9.208238  0.4859731  7.035320
##     1.00   7.830577  0.6201989  5.765375
##     2.00   6.795857  0.7059855  4.804159
##     4.00   5.828115  0.7792267  4.023401
##     8.00   5.043870  0.8315865  3.461188
##    16.00   4.418271  0.8710756  3.025120
##    32.00   4.028473  0.8917098  2.720785
##    64.00   3.699880  0.9070696  2.452637
##   128.00   3.427293  0.9186524  2.205700
## 
## Tuning parameter 'sigma' was held constant at a value of 0.03372332
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.03372332 and C = 128.
# Score the tuned SVM on the held-out spectra (arguments are pred, then obs).
tecatorSVMPred <- predict(tecatorSVM, newdata = tecatorTestX)
tecatorSVMTest <- postResample(tecatorSVMPred, tecatorTestY)
tecatorSVMTest
##      RMSE  Rsquared       MAE 
## 2.0627442 0.9765407 1.5491585

3.3 Neural network without PCA

# Neural network on all 100 centered and scaled wavelengths. MaxNWts raises
# nnet's limit on the number of weights, which the larger networks need with
# this many inputs. trace = FALSE silences the optimizer log, and
# linout = TRUE requests a linear output unit for the numeric response.
set.seed(300)
tecatorNNet <- train(
  x = tecatorTrainX,
  y = tecatorTrainY,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneLength = 10,
  MaxNWts = 5000,
  trace = FALSE,
  linout = TRUE,
  trControl = tecatorCtrl
)

# Print the resampling summary.
tecatorNNet
## Neural Network 
## 
## 163 samples
## 100 predictors
## 
## Pre-processing: centered (100), scaled (100) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 147, 147, 146, 147, 145, 147, ... 
## Resampling results across tuning parameters:
## 
##   size  decay         RMSE       Rsquared   MAE      
##    1    0.0000000000  3.3748934  0.8489082  2.6300007
##    1    0.0001000000  2.6394036  0.9078711  2.0157079
##    1    0.0002371374  2.5266869  0.9456018  1.9540246
##    1    0.0005623413  1.9783380  0.9703976  1.4926290
##    1    0.0013335214  1.8473728  0.9708264  1.4153430
##    1    0.0031622777  2.1319902  0.9631803  1.6622617
##    1    0.0074989421  1.5985734  0.9847969  1.2267898
##    1    0.0177827941  1.6088586  0.9852214  1.2398776
##    1    0.0421696503  1.7311092  0.9828303  1.3607529
##    1    0.1000000000  1.8806123  0.9796304  1.5095193
##    3    0.0000000000  1.4653855  0.9661746  1.1293525
##    3    0.0001000000  1.5499394  0.9647025  1.1809272
##    3    0.0002371374  1.1886088  0.9907815  0.8730048
##    3    0.0005623413  1.1262054  0.9915832  0.8273566
##    3    0.0013335214  1.3706435  0.9678276  1.0534186
##    3    0.0031622777  1.2717741  0.9839998  0.9956867
##    3    0.0074989421  1.1168124  0.9886866  0.8521549
##    3    0.0177827941  1.1287479  0.9920149  0.8617010
##    3    0.0421696503  0.8875830  0.9949941  0.6737407
##    3    0.1000000000  1.1695287  0.9912084  0.8904107
##    5    0.0000000000  1.2943482  0.9824663  0.9063088
##    5    0.0001000000  1.0881498  0.9930436  0.7728354
##    5    0.0002371374  1.2853898  0.9808752  0.7823308
##    5    0.0005623413  0.7307418  0.9966792  0.5566825
##    5    0.0013335214  1.1396039  0.9913319  0.8574557
##    5    0.0031622777  0.9773469  0.9924227  0.7356896
##    5    0.0074989421  1.0481612  0.9890946  0.7045886
##    5    0.0177827941  1.0273368  0.9921898  0.7916347
##    5    0.0421696503  0.9226952  0.9945474  0.6804591
##    5    0.1000000000  0.8856328  0.9951285  0.6809366
##    7    0.0000000000  1.0730891  0.9913878  0.7483220
##    7    0.0001000000  1.6933225  0.9633338  0.8809912
##    7    0.0002371374  1.7877131  0.9592821  1.0261008
##    7    0.0005623413  1.5605535  0.9689060  1.0745773
##    7    0.0013335214  1.2158770  0.9885347  0.8790020
##    7    0.0031622777  1.0771136  0.9922777  0.7234979
##    7    0.0074989421  0.9953547  0.9923080  0.7225621
##    7    0.0177827941  0.9208951  0.9944338  0.6601241
##    7    0.0421696503  0.9576107  0.9943320  0.6807309
##    7    0.1000000000  0.9314200  0.9944975  0.7053067
##    9    0.0000000000  1.6959917  0.9709044  1.2105240
##    9    0.0001000000  1.8361193  0.9606971  1.1080604
##    9    0.0002371374  1.3061673  0.9862578  0.8609660
##    9    0.0005623413  1.5480473  0.9642395  1.1224330
##    9    0.0013335214  1.3438623  0.9866264  0.9422799
##    9    0.0031622777  1.1240981  0.9928630  0.7681495
##    9    0.0074989421  1.1982378  0.9885432  0.7901904
##    9    0.0177827941  1.0152215  0.9934577  0.7401710
##    9    0.0421696503  0.9509064  0.9937057  0.7060800
##    9    0.1000000000  1.0725768  0.9923821  0.8215494
##   11    0.0000000000  1.6431761  0.9704837  1.0243531
##   11    0.0001000000  1.8330285  0.9737117  1.1441804
##   11    0.0002371374  1.8738840  0.9694080  1.1238777
##   11    0.0005623413  1.3648284  0.9865846  0.9005617
##   11    0.0013335214  1.6449068  0.9657946  1.1993589
##   11    0.0031622777  1.1997724  0.9889180  0.8683719
##   11    0.0074989421  1.4383009  0.9849888  0.9748095
##   11    0.0177827941  1.2400778  0.9856412  0.8754709
##   11    0.0421696503  1.2438852  0.9898267  0.9423621
##   11    0.1000000000  1.0703878  0.9929037  0.8313989
##   13    0.0000000000  1.7811439  0.9805298  1.1073335
##   13    0.0001000000  1.6267135  0.9804671  1.1045071
##   13    0.0002371374  1.8990771  0.9622562  1.2157107
##   13    0.0005623413  1.5712491  0.9801155  1.0493748
##   13    0.0013335214  1.2959990  0.9859978  0.8737195
##   13    0.0031622777  1.5905770  0.9680035  1.0670431
##   13    0.0074989421  1.6698651  0.9785851  1.1423652
##   13    0.0177827941  1.1313058  0.9892363  0.8398519
##   13    0.0421696503  1.3915801  0.9866615  1.0172209
##   13    0.1000000000  1.1301990  0.9909994  0.8573576
##   15    0.0000000000  1.6045866  0.9800353  1.1187889
##   15    0.0001000000  1.9072691  0.9727152  1.2322180
##   15    0.0002371374  1.6946924  0.9731679  1.1225041
##   15    0.0005623413  1.5747062  0.9808483  1.0743365
##   15    0.0013335214  2.1303662  0.9438193  1.1063420
##   15    0.0031622777  1.6095085  0.9792667  1.1175745
##   15    0.0074989421  1.1630414  0.9902363  0.8018904
##   15    0.0177827941  1.0421371  0.9930076  0.7451481
##   15    0.0421696503  1.2229017  0.9909661  0.9033315
##   15    0.1000000000  1.0655111  0.9928298  0.7986673
##   17    0.0000000000  1.7727472  0.9735758  1.0896093
##   17    0.0001000000  1.8998580  0.9733860  1.2595240
##   17    0.0002371374  1.3772725  0.9874607  0.9006951
##   17    0.0005623413  1.5193358  0.9807869  1.0189533
##   17    0.0013335214  1.4856010  0.9794695  0.9595828
##   17    0.0031622777  1.6162294  0.9772975  1.0803924
##   17    0.0074989421  1.3684926  0.9864659  0.9257095
##   17    0.0177827941  1.1302780  0.9920378  0.8173160
##   17    0.0421696503  1.0904178  0.9927844  0.7832132
##   17    0.1000000000  1.0972224  0.9914710  0.8451023
##   19    0.0000000000  1.4777025  0.9802953  0.9872595
##   19    0.0001000000  1.9261393  0.9682555  1.1460864
##   19    0.0002371374  2.2733755  0.9563345  1.4388090
##   19    0.0005623413  1.7799808  0.9760278  1.1156646
##   19    0.0013335214  1.4465048  0.9833817  0.9399811
##   19    0.0031622777  1.3189520  0.9882580  0.8790357
##   19    0.0074989421  1.1950325  0.9894726  0.8622395
##   19    0.0177827941  1.2646222  0.9907562  0.9180830
##   19    0.0421696503  1.1106561  0.9922003  0.8081716
##   19    0.1000000000  1.1544736  0.9916395  0.8787344
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5 and decay = 0.0005623413.
# Score the tuned network on the held-out spectra (arguments are pred, then
# obs).
tecatorNNetPred <- predict(tecatorNNet, newdata = tecatorTestX)
tecatorNNetTest <- postResample(tecatorNNetPred, tecatorTestY)
tecatorNNetTest
##      RMSE  Rsquared       MAE 
## 0.7129442 0.9972361 0.5407303

The Tecator spectra are high-dimensional enough that the neural network fit needs a larger MaxNWts setting than the nnet default. Without that adjustment, caret::train() can stop before it evaluates the tuning grid.

3.4 Neural network with PCA

# Same network as the non-PCA fit, but the centered and scaled spectra are
# replaced by principal components before modeling.
# NOTE(review): the resampled RMSE printed below is roughly ten times worse
# than for the non-PCA network. Presumably the PCA variance threshold in
# effect keeps too few components for these highly correlated spectra --
# confirm via tecatorNNetPCA$preProcess and consider setting preProcOptions
# (thresh or pcaComp) in trainControl().
set.seed(300)
tecatorNNetPCA <- train(
  x = tecatorTrainX,
  y = tecatorTrainY,
  method = "nnet",
  preProc = c("center", "scale", "pca"),
  tuneLength = 10,
  MaxNWts = 5000,
  trace = FALSE,
  linout = TRUE,
  trControl = tecatorCtrl
)

# Print the resampling summary.
tecatorNNetPCA
## Neural Network 
## 
## 163 samples
## 100 predictors
## 
## Pre-processing: centered (100), scaled (100), principal component
##  signal extraction (100) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 147, 147, 146, 147, 145, 147, ... 
## Resampling results across tuning parameters:
## 
##   size  decay         RMSE      Rsquared   MAE      
##    1    0.0000000000  10.90292  0.2674157   8.640002
##    1    0.0001000000  10.88184  0.2819602   8.583462
##    1    0.0002371374  10.86703  0.2744623   8.550238
##    1    0.0005623413  10.73152  0.2915750   8.440223
##    1    0.0013335214  10.84576  0.2809556   8.613525
##    1    0.0031622777  10.80228  0.2760856   8.650480
##    1    0.0074989421  10.70613  0.2847060   8.516100
##    1    0.0177827941  10.98621  0.2690255   8.643927
##    1    0.0421696503  10.87109  0.2760213   8.573873
##    1    0.1000000000  10.73332  0.2926369   8.487704
##    3    0.0000000000  12.39318  0.2511370   9.177504
##    3    0.0001000000  10.72009  0.3103606   8.308195
##    3    0.0002371374  11.16789  0.2722659   8.633842
##    3    0.0005623413  11.29409  0.2719384   8.684062
##    3    0.0013335214  10.87736  0.2901795   8.325882
##    3    0.0031622777  10.87769  0.2820752   8.397501
##    3    0.0074989421  11.19619  0.2515406   8.643258
##    3    0.0177827941  10.62905  0.3091640   8.358867
##    3    0.0421696503  10.75940  0.3120900   8.443146
##    3    0.1000000000  11.03146  0.2771992   8.518716
##    5    0.0000000000  11.01003  0.2859419   8.536227
##    5    0.0001000000  11.15461  0.2678209   8.654611
##    5    0.0002371374  10.92484  0.2880913   8.481409
##    5    0.0005623413  10.94343  0.3121318   8.577733
##    5    0.0013335214  11.32780  0.2821218   8.605204
##    5    0.0031622777  11.10276  0.2878608   8.599919
##    5    0.0074989421  11.23473  0.2764125   8.721918
##    5    0.0177827941  11.03836  0.2850252   8.659015
##    5    0.0421696503  11.05339  0.2856332   8.470255
##    5    0.1000000000  10.84979  0.2918739   8.310393
##    7    0.0000000000  13.76395  0.1905671   9.734513
##    7    0.0001000000  11.73203  0.2407999   8.867868
##    7    0.0002371374  11.75056  0.2854953   8.840708
##    7    0.0005623413  11.61873  0.2705597   8.875471
##    7    0.0013335214  12.58453  0.2421046   9.338730
##    7    0.0031622777  11.91488  0.2213087   9.087011
##    7    0.0074989421  11.61841  0.2706148   8.892620
##    7    0.0177827941  11.34197  0.2650089   8.798185
##    7    0.0421696503  11.34508  0.2664991   8.659684
##    7    0.1000000000  11.51137  0.2562407   8.975498
##    9    0.0000000000  13.50296  0.2412815   9.587457
##    9    0.0001000000  11.50634  0.2631705   8.765976
##    9    0.0002371374  12.18666  0.2461314   9.207804
##    9    0.0005623413  12.43376  0.2319100   9.542483
##    9    0.0013335214  11.94986  0.2488685   9.219970
##    9    0.0031622777  11.69691  0.2494115   8.988400
##    9    0.0074989421  11.98387  0.2350277   9.049897
##    9    0.0177827941  11.83265  0.2435747   8.950579
##    9    0.0421696503  11.82992  0.2719722   8.948015
##    9    0.1000000000  11.25369  0.2896718   8.594318
##   11    0.0000000000  12.46809  0.2676595   9.314241
##   11    0.0001000000  13.99256  0.2219390   9.910948
##   11    0.0002371374  11.45315  0.3022936   8.697976
##   11    0.0005623413  12.26751  0.2303302   9.446110
##   11    0.0013335214  12.67345  0.2166041   9.472723
##   11    0.0031622777  11.67908  0.2722234   8.956365
##   11    0.0074989421  12.18195  0.2495960   9.131475
##   11    0.0177827941  12.81600  0.2320390   9.469484
##   11    0.0421696503  12.02092  0.2353188   9.055371
##   11    0.1000000000  11.75659  0.2587794   8.849204
##   13    0.0000000000  14.76678  0.2107805  10.013371
##   13    0.0001000000  15.29393  0.2253348  10.484882
##   13    0.0002371374  13.43785  0.2413870   9.724509
##   13    0.0005623413  13.32054  0.2309030   9.595126
##   13    0.0013335214  12.03849  0.2732607   9.147005
##   13    0.0031622777  12.06730  0.2655597   9.206528
##   13    0.0074989421  12.57171  0.2377805   9.427711
##   13    0.0177827941  12.10552  0.2605073   9.177575
##   13    0.0421696503  12.19018  0.2462055   9.338889
##   13    0.1000000000  11.42890  0.2827441   8.600421
##   15    0.0000000000  12.74924  0.2569147   9.695026
##   15    0.0001000000  13.72333  0.2447829   9.942559
##   15    0.0002371374  12.87996  0.2181036   9.451581
##   15    0.0005623413  12.39837  0.2634143   9.445597
##   15    0.0013335214  13.74406  0.2083712   9.912984
##   15    0.0031622777  13.53585  0.2323961   9.851361
##   15    0.0074989421  12.57301  0.2492382   9.330538
##   15    0.0177827941  12.33831  0.2606713   9.349841
##   15    0.0421696503  12.39129  0.2612789   9.274948
##   15    0.1000000000  11.75499  0.2736104   8.834722
##   17    0.0000000000  16.39268  0.1963940  11.019996
##   17    0.0001000000  15.15635  0.1961516  10.744496
##   17    0.0002371374  12.26769  0.2829626   9.406098
##   17    0.0005623413  13.45047  0.2246680   9.876215
##   17    0.0013335214  14.15363  0.2274107  10.266294
##   17    0.0031622777  13.53779  0.2266599   9.889848
##   17    0.0074989421  12.50609  0.2568511   9.238037
##   17    0.0177827941  12.94703  0.2399308   9.702571
##   17    0.0421696503  12.50475  0.2698397   9.331555
##   17    0.1000000000  12.19839  0.2685989   9.186762
##   19    0.0000000000  15.05436  0.2068622  10.702608
##   19    0.0001000000  12.35491  0.2777730   9.214917
##   19    0.0002371374  13.14056  0.2544752   9.530942
##   19    0.0005623413  13.21093  0.2352333   9.775023
##   19    0.0013335214  14.06558  0.2198226   9.994912
##   19    0.0031622777  13.62012  0.2539054   9.846126
##   19    0.0074989421  12.69791  0.2665711   9.525719
##   19    0.0177827941  13.29494  0.2263279   9.950539
##   19    0.0421696503  12.01157  0.2719744   8.990127
##   19    0.1000000000  12.40414  0.2502822   9.294977
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3 and decay = 0.01778279.
# Predict the held-out Tecator samples with the PCA neural network and
# summarise the test-set error (RMSE, Rsquared, MAE).
tecatorNNetPCAPred <- predict(tecatorNNetPCA, newdata = tecatorTestX)
tecatorNNetPCATest <- postResample(
  pred = tecatorNNetPCAPred,
  obs = tecatorTestY
)
print(tecatorNNetPCATest)
##       RMSE   Rsquared        MAE 
## 11.7135136  0.2223439  9.3315084

3.5 MARS model

# Tune a MARS model (earth) over 10 candidate settings using the Tecator
# resampling control object; the seed is fixed right before training.
set.seed(300)
tecatorMARS <- train(
  method = "earth",
  x = tecatorTrainX,
  y = tecatorTrainY,
  trControl = tecatorCtrl,
  tuneLength = 10
)

print(tecatorMARS)
## Multivariate Adaptive Regression Spline 
## 
## 163 samples
## 100 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 147, 147, 146, 147, 145, 147, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE       Rsquared   MAE     
##    2      10.462357  0.3176694  8.437541
##    3       9.815835  0.4142557  7.690837
##    4       6.756469  0.7244056  4.941173
##    5       4.363844  0.8907567  3.193900
##    6       3.900484  0.9150205  2.980521
##    7       3.032307  0.9512811  2.462839
##    8       2.953161  0.9531898  2.361931
##    9       2.772357  0.9567941  2.145741
##   10       2.607950  0.9623412  1.970821
##   11       2.592211  0.9632178  1.940826
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 11 and degree = 1.
# Test-set predictions and error summary for the tuned MARS model.
tecatorMARSPred <- predict(tecatorMARS, newdata = tecatorTestX)
tecatorMARSTest <- postResample(
  pred = tecatorMARSPred,
  obs = tecatorTestY
)
print(tecatorMARSTest)
##      RMSE  Rsquared       MAE 
## 3.0218533 0.9493355 2.1957926

3.6 KNN model

# Tune k-nearest neighbours on centered and scaled predictors, trying 20
# candidate values of k with the Tecator resampling control object.
set.seed(300)
tecatorKNN <- train(
  method = "knn",
  x = tecatorTrainX,
  y = tecatorTrainY,
  preProcess = c("center", "scale"),
  trControl = tecatorCtrl,
  tuneLength = 20
)

print(tecatorKNN)
## k-Nearest Neighbors 
## 
## 163 samples
## 100 predictors
## 
## Pre-processing: centered (100), scaled (100) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 147, 147, 146, 147, 145, 147, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE     
##    5   8.967288  0.4977218  6.896029
##    7   8.984129  0.4970002  7.061153
##    9   9.184783  0.4844418  7.273277
##   11   9.468309  0.4545029  7.600409
##   13   9.663517  0.4335040  7.832783
##   15   9.783786  0.4241510  8.009063
##   17  10.035280  0.3880472  8.215944
##   19  10.118732  0.3792420  8.304211
##   21  10.280600  0.3539041  8.418427
##   23  10.520583  0.3183199  8.587498
##   25  10.653069  0.3004806  8.681427
##   27  10.754285  0.2867677  8.730408
##   29  10.880162  0.2684256  8.796776
##   31  10.936013  0.2597130  8.836808
##   33  10.934395  0.2599123  8.843391
##   35  10.897764  0.2671199  8.858113
##   37  10.887659  0.2705677  8.877905
##   39  10.887854  0.2696809  8.888192
##   41  10.871537  0.2723960  8.877390
##   43  10.838769  0.2774184  8.865553
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
# Test-set predictions and error summary for the tuned KNN model.
tecatorKNNPred <- predict(tecatorKNN, newdata = tecatorTestX)
tecatorKNNTest <- postResample(
  pred = tecatorKNNPred,
  obs = tecatorTestY
)
print(tecatorKNNTest)
##       RMSE   Rsquared        MAE 
## 10.3642653  0.3924421  8.3836538

3.7 Compare Tecator results

# Stack the test-set metrics of every Tecator model into one matrix:
# one named row per model, one column per metric.
tecatorResults <- do.call(
  rbind,
  list(
    SVM_Radial = tecatorSVMTest,
    Neural_Network = tecatorNNetTest,
    Neural_Network_PCA = tecatorNNetPCATest,
    MARS = tecatorMARSTest,
    KNN = tecatorKNNTest
  )
)

print(tecatorResults)
##                          RMSE  Rsquared       MAE
## SVM_Radial          2.0627442 0.9765407 1.5491585
## Neural_Network      0.7129442 0.9972361 0.5407303
## Neural_Network_PCA 11.7135136 0.2223439 9.3315084
## MARS                3.0218533 0.9493355 2.1957926
## KNN                10.3642653 0.3924421 8.3836538
# Same table, ordered from lowest to highest test-set RMSE.
tecatorResults[order(tecatorResults[, "RMSE"]), ]
##                          RMSE  Rsquared       MAE
## Neural_Network      0.7129442 0.9972361 0.5407303
## SVM_Radial          2.0627442 0.9765407 1.5491585
## MARS                3.0218533 0.9493355 2.1957926
## KNN                10.3642653 0.3924421 8.3836538
## Neural_Network_PCA 11.7135136 0.2223439 9.3315084

3.8 Written answer

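Compare the neural network model with and without PCA. If the PCA model has lower test RMSE or more stable resampling performance, PCA helped reduce the effect of highly correlated spectral predictors. If performance is similar or worse, the neural network tuning process may have handled the correlation adequately, or the PCA transformation may have discarded useful predictive detail. In the results above, the second case applies: the neural network without PCA has the lowest test-set error of all models (RMSE 0.71, R-squared 0.997), whereas the PCA version is the worst (RMSE 11.71, R-squared 0.22). PCA therefore did not help here; the retained components appear to have discarded most of the information needed to predict the response.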

4 Exercise 7.4: Permeability Data from Exercise 6.2

This section uses the permeability data from the AppliedPredictiveModeling package.

4.1 Load and split data

data(permeability)

# Use the fingerprints as a data frame and make sure every predictor has a
# column name before it is handed to caret.
perm_x <- as.data.frame(fingerprints)
if (is.null(names(perm_x))) {
  names(perm_x) <- sprintf("X%d", seq_len(ncol(perm_x)))
}
perm_y <- permeability

# 75/25 train/test partition on the response, with a fixed seed.
set.seed(400)
permIndex <- createDataPartition(perm_y, p = 0.75, list = FALSE)
permTestX <- perm_x[-permIndex, ]
permTestY <- perm_y[-permIndex]
permTrainX <- perm_x[permIndex, ]
permTrainY <- perm_y[permIndex]

# Resampling scheme for all permeability models: 10-fold CV, repeated 3 times.
permCtrl <- trainControl(method = "repeatedcv", repeats = 3, number = 10)

The permeability predictors are also converted to a named data frame before fitting. This avoids the same caret error we hit in Tecator when a matrix arrives without column names.

4.2 Train nonlinear models

# KNN on centered and scaled fingerprints, 20 candidate values of k.
set.seed(400)
permKNN <- train(
  method = "knn",
  x = permTrainX,
  y = permTrainY,
  preProcess = c("center", "scale"),
  trControl = permCtrl,
  tuneLength = 20
)

# Radial-basis SVM on centered and scaled fingerprints, 10 tuning candidates.
set.seed(400)
permSVM <- train(
  method = "svmRadial",
  x = permTrainX,
  y = permTrainY,
  preProcess = c("center", "scale"),
  trControl = permCtrl,
  tuneLength = 10
)

# MARS (earth) on the raw fingerprints, 10 tuning candidates.
set.seed(400)
permMARS <- train(
  method = "earth",
  x = permTrainX,
  y = permTrainY,
  trControl = permCtrl,
  tuneLength = 10
)

# nnet refuses to fit when the number of weights exceeds MaxNWts. For a
# single-hidden-layer regression network with p inputs the count is
# size * (p + 1) + size + 1, and caret's default nnet grid for
# tuneLength = 10 tries hidden-layer sizes 1, 3, ..., 19.
# With the unfiltered fingerprints (1107 columns in AppliedPredictiveModeling;
# TODO confirm for the installed version) a fixed budget of 5000 is already
# exceeded at size = 5, so most of the grid would fail inside resampling.
# Derive the budget from the predictor count instead, never going below the
# previous value of 5000.
# NOTE(review): actually fitting the larger networks makes this run much
# slower; filter near-zero-variance fingerprints first if that is a problem.
permNNetMaxSize <- 2 * 10 - 1
permNNetMaxNWts <- max(
  5000,
  permNNetMaxSize * (ncol(permTrainX) + 1) + permNNetMaxSize + 1
)

set.seed(400)
permNNet <- train(
  x = permTrainX,
  y = permTrainY,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneLength = 10,
  MaxNWts = permNNetMaxNWts,
  trace = FALSE,   # suppress nnet's per-iteration optimisation log
  linout = TRUE,   # linear output unit for a regression response
  trControl = permCtrl
)

The permeability fingerprints are also high-dimensional, so the neural network uses a larger MaxNWts budget to keep the tuning run from failing during resampling.

4.3 Evaluate nonlinear models

# Predict the held-out permeability samples with every tuned model.
permPreds <- lapply(
  list(
    KNN = permKNN,
    SVM_Radial = permSVM,
    MARS = permMARS,
    Neural_Network = permNNet
  ),
  predict,
  newdata = permTestX
)

# One row of RMSE / Rsquared / MAE per model.
permResults <- t(vapply(
  permPreds,
  function(pred) postResample(pred = pred, obs = permTestY),
  numeric(3)
))
print(permResults)
##                    RMSE  Rsquared      MAE
## KNN            10.48300 0.5085284 6.943112
## SVM_Radial     11.09384 0.4587844 7.977576
## MARS           12.51146 0.2931987 7.495023
## Neural_Network 11.76658 0.3760560 8.080520
# Same table, ordered from lowest to highest test-set RMSE.
permResults[order(permResults[, "RMSE"]), ]
##                    RMSE  Rsquared      MAE
## KNN            10.48300 0.5085284 6.943112
## SVM_Radial     11.09384 0.4587844 7.977576
## Neural_Network 11.76658 0.3760560 8.080520
## MARS           12.51146 0.2931987 7.495023
# Collect the resampling results of the four permeability models and print
# summary statistics of their MAE, RMSE and Rsquared side by side.
resamples(list(
  KNN = permKNN,
  SVM_Radial = permSVM,
  MARS = permMARS,
  Neural_Network = permNNet
)) %>% summary()
## 
## Call:
## summary.resamples(object = .)
## 
## Models: KNN, SVM_Radial, MARS, Neural_Network 
## Number of resamples: 30 
## 
## MAE 
##                    Min.  1st Qu.   Median     Mean   3rd Qu.     Max. NA's
## KNN            5.305333 7.270375 8.405028 8.810520  9.552176 16.61422    0
## SVM_Radial     4.030748 6.351954 7.016820 7.788855  8.810715 13.88820    0
## MARS           3.666791 6.454053 8.000424 8.338561  9.822836 14.74815    0
## Neural_Network 4.616390 8.256806 9.405143 9.626324 11.108314 17.50329    0
## 
## RMSE 
##                    Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## KNN            8.242412 10.221147 11.92331 12.60804 14.77713 22.63340    0
## SVM_Radial     4.736495  9.272856 11.22163 11.58015 14.00103 19.37903    0
## MARS           5.926957  9.152803 12.01525 12.19143 14.26787 19.93362    0
## Neural_Network 6.691102 11.175592 12.30595 12.84158 14.63003 21.95674    0
## 
## Rsquared 
##                       Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## KNN            0.008046846 0.2602728 0.5208523 0.4553927 0.6419539 0.8105656
## SVM_Radial     0.001800984 0.2913908 0.4964439 0.5001240 0.7163796 0.9351543
## MARS           0.015046300 0.2757843 0.4599600 0.4678860 0.6635779 0.8927610
## Neural_Network 0.017237335 0.2452222 0.3899458 0.4000434 0.5741837 0.7852710
##                NA's
## KNN               0
## SVM_Radial        0
## MARS              0
## Neural_Network    0

4.4 Written answer

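  1. The optimal nonlinear model is the model with the lowest test-set RMSE and competitive resampling RMSE. In the results above, KNN has the lowest test-set RMSE (10.48, R-squared 0.51), followed by the radial SVM (11.09, R-squared 0.46). The radial SVM has the lowest mean resampling RMSE (11.58 versus 12.61 for KNN), so the two models are close and neither is clearly dominant.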

  2. Compare the best nonlinear model to the best linear model from Exercise 6.2. If a nonlinear model performs better, this suggests the relationship between the predictors and permeability may include nonlinearities or interactions that the linear model did not capture.

  3. A replacement for the laboratory experiment should only be recommended if the model has sufficiently low error for the scientific or business use case, is stable across resampling, and performs well on the test set. Predictive performance alone may not be enough; model reliability, domain validation, and acceptable error thresholds are also required.

5 Exercise 7.5: Chemical Manufacturing Process Data from Exercise 6.3

This section uses the Chemical Manufacturing Process data from the AppliedPredictiveModeling package.

5.1 Load data, impute, split, and preprocess

data(ChemicalManufacturingProcess)

chemData <- ChemicalManufacturingProcess

# The response must be called Yield in the rest of the analysis; the
# remaining columns are the biological and process predictors.
# If only a lower-case yield column exists, copy it to Yield (appended as the
# last column) and drop the original; if neither spelling exists, stop.
if (!("Yield" %in% names(chemData))) {
  if (!("yield" %in% names(chemData))) {
    stop("ChemicalManufacturingProcess must contain a Yield or yield column.")
  }
  chemData$Yield <- chemData$yield
  chemData$yield <- NULL
}

# 75/25 train/test partition on Yield, with a fixed seed.
set.seed(500)
chemIndex <- createDataPartition(chemData[["Yield"]], p = 0.75, list = FALSE)
chemTest <- chemData[-chemIndex, ]
chemTrain <- chemData[chemIndex, ]

# Separate the response from the predictors in each partition.
chemTrainY <- chemTrain[["Yield"]]
chemTrainX <- select(chemTrain, -Yield)
chemTestY <- chemTest[["Yield"]]
chemTestX <- select(chemTest, -Yield)

# Estimate median imputation, centering and scaling on the training
# predictors only, then apply the same transformation to both partitions.
chemPP <- preProcess(chemTrainX, method = c("medianImpute", "center", "scale"))
chemTrainXpp <- predict(chemPP, chemTrainX)
chemTestXpp <- predict(chemPP, chemTestX)

# Drop near-zero-variance predictors identified on the training set from both
# partitions. drop = FALSE keeps the result a data frame even if only a single
# predictor is left after filtering.
nzv <- nearZeroVar(chemTrainXpp)
if (length(nzv) > 0) {
  chemTrainXpp <- chemTrainXpp[, -nzv, drop = FALSE]
  chemTestXpp <- chemTestXpp[, -nzv, drop = FALSE]
}

# Resampling scheme for all chemical models: 10-fold CV, repeated 3 times.
chemCtrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

This section also includes a small compatibility guard for the response name. Some installations expose the chemical manufacturing target as yield, so the code renames it to Yield and continues with a single consistent response name throughout the analysis.

5.2 Train nonlinear models

# KNN on the already preprocessed predictors, 20 candidate values of k.
set.seed(500)
chemKNN <- train(
  method = "knn",
  x = chemTrainXpp,
  y = chemTrainY,
  trControl = chemCtrl,
  tuneLength = 20
)

# Radial-basis SVM on the already preprocessed predictors, 10 candidates.
set.seed(500)
chemSVM <- train(
  method = "svmRadial",
  x = chemTrainXpp,
  y = chemTrainY,
  trControl = chemCtrl,
  tuneLength = 10
)

# MARS (earth) on the already preprocessed predictors, 10 candidates.
set.seed(500)
chemMARS <- train(
  method = "earth",
  x = chemTrainXpp,
  y = chemTrainY,
  trControl = chemCtrl,
  tuneLength = 10
)

# Single-hidden-layer neural network on the already preprocessed predictors.
# linout = TRUE gives a linear output unit for regression, trace = FALSE
# silences nnet's optimisation log, and MaxNWts raises nnet's weight limit.
set.seed(500)
chemNNet <- train(
  method = "nnet",
  x = chemTrainXpp,
  y = chemTrainY,
  trControl = chemCtrl,
  tuneLength = 10,
  linout = TRUE,
  trace = FALSE,
  MaxNWts = 5000
)

The chemical manufacturing predictor matrix is wide enough that the neural network also needs a larger MaxNWts limit here.

5.3 Evaluate nonlinear models

# Predict the held-out chemical samples with every tuned model.
chemPreds <- lapply(
  list(
    KNN = chemKNN,
    SVM_Radial = chemSVM,
    MARS = chemMARS,
    Neural_Network = chemNNet
  ),
  predict,
  newdata = chemTestXpp
)

# One row of RMSE / Rsquared / MAE per model.
chemResults <- t(vapply(
  chemPreds,
  function(pred) postResample(pred = pred, obs = chemTestY),
  numeric(3)
))
print(chemResults)
##                    RMSE  Rsquared       MAE
## KNN            1.487910 0.4049052 1.1788182
## SVM_Radial     1.028066 0.7051389 0.8658136
## MARS           1.168513 0.6386169 0.9598293
## Neural_Network 1.345515 0.5033014 1.1235290
# Same table, ordered from lowest to highest test-set RMSE.
chemResults[order(chemResults[, "RMSE"]), ]
##                    RMSE  Rsquared       MAE
## SVM_Radial     1.028066 0.7051389 0.8658136
## MARS           1.168513 0.6386169 0.9598293
## Neural_Network 1.345515 0.5033014 1.1235290
## KNN            1.487910 0.4049052 1.1788182

5.4 Variable importance for the best nonlinear model

# Select the model with the lowest test-set RMSE in chemResults instead of
# hard-coding it, so the importance analysis cannot drift out of sync with
# the evaluation table when the models are refit.
chemModels <- list(
  KNN = chemKNN,
  SVM_Radial = chemSVM,
  MARS = chemMARS,
  Neural_Network = chemNNet
)
bestChemName <- rownames(chemResults)[which.min(chemResults[, "RMSE"])]
bestChemModel <- chemModels[[bestChemName]]
# Fail loudly if the row names of chemResults do not match the model list.
stopifnot(!is.null(bestChemModel))

# NOTE: when the radial SVM is selected, the printed header reads
# "loess r-squared variable importance", i.e. the scores are a filter-style
# measure of each predictor against the response rather than quantities
# derived from the fitted SVM itself.
chemImportance <- varImp(bestChemModel)
chemImportance
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   99.92
## BiologicalMaterial06     91.25
## ManufacturingProcess17   87.24
## BiologicalMaterial03     73.21
## ManufacturingProcess09   72.18
## ManufacturingProcess31   71.25
## ManufacturingProcess36   70.95
## BiologicalMaterial12     70.57
## BiologicalMaterial02     66.75
## ManufacturingProcess06   56.29
## BiologicalMaterial01     49.89
## ManufacturingProcess30   48.10
## BiologicalMaterial11     47.99
## BiologicalMaterial04     45.92
## BiologicalMaterial08     45.65
## ManufacturingProcess12   44.09
## ManufacturingProcess29   42.73
## ManufacturingProcess04   39.44
## ManufacturingProcess11   36.92
# Plot the 20 highest-ranked predictors from chemImportance.
plot(chemImportance, top = 20, main = "Top variable importance: best nonlinear model", col = "#2C7BB6")

5.5 Explore top predictors

# Turn the importance scores into a data frame with the predictor names in a
# column, sorted from most to least important.
importance_df <- arrange(
  tibble::rownames_to_column(chemImportance$importance, var = "Predictor"),
  desc(Overall)
)

# Names of the (at most) ten highest-ranked predictors.
top_predictors <- importance_df[["Predictor"]][
  seq_len(min(10, nrow(importance_df)))
]
print(top_predictors)
##  [1] "ManufacturingProcess13" "ManufacturingProcess32" "BiologicalMaterial06"  
##  [4] "ManufacturingProcess17" "BiologicalMaterial03"   "ManufacturingProcess09"
##  [7] "ManufacturingProcess31" "ManufacturingProcess36" "BiologicalMaterial12"  
## [10] "BiologicalMaterial02"
# Reshape the top predictors to long form: one row per (sample, predictor)
# pair, with the training Yield carried alongside each value.
plot_data <- select(chemTrainXpp, all_of(top_predictors))
plot_data <- mutate(plot_data, Yield = chemTrainY)
plot_data <- pivot_longer(
  plot_data,
  cols = all_of(top_predictors),
  names_to = "Predictor",
  values_to = "Value"
)

# Scatter of Yield against each predictor with a loess trend, one panel per
# predictor and a free x-axis per panel.
ggplot(data = plot_data, mapping = aes(x = Value, y = Yield)) +
  geom_point(color = "#2C7BB6", alpha = 0.55) +
  geom_smooth(
    method = "loess",
    color = "#D7191C",
    linewidth = 0.7,
    se = FALSE
  ) +
  facet_wrap(vars(Predictor), scales = "free_x") +
  labs(title = "Relationships between top predictors and yield") +
  theme_classic(base_size = 11)

5.6 Written answer

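  1. The optimal nonlinear regression model is the model with the lowest test-set RMSE, while also showing strong and stable resampling performance. In the test-set results above this is the radial SVM (RMSE 1.03, R-squared 0.71), followed by MARS (RMSE 1.17, R-squared 0.64).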

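  2. Use varImp(bestChemModel) to identify the most important predictors. Compare the names of the top predictors to determine whether biological or process variables dominate. In the importance output above, six of the top ten predictors are manufacturing process variables and four are biological materials, with ManufacturingProcess13 and ManufacturingProcess32 ranked highest. Note that the header of that output reads "loess r-squared variable importance", so the ranking reflects each predictor's individual relationship with yield rather than the internals of the fitted SVM. Then compare these top ten predictors to the top ten predictors from the optimal linear model from Exercise 6.3.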

  3. For predictors that appear in the nonlinear model but not in the linear model, inspect the faceted loess plots. Curved patterns, thresholds, or interactions can explain why nonlinear models may find these predictors useful even when linear models did not rank them highly.

6 Session Information

# Record the R version, platform and package versions used for this run.
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.2                     dplyr_1.2.0                    
##  [3] AppliedPredictiveModeling_1.1-7 pls_2.9-0                      
##  [5] nnet_7.3-20                     earth_5.3.5                    
##  [7] plotmo_3.7.0                    plotrix_3.8-14                 
##  [9] Formula_1.2-5                   mlbench_2.1-7                  
## [11] kernlab_0.9-33                  caret_7.0-1                    
## [13] lattice_0.22-7                  ggplot2_4.0.2                  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4052.112    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        pROC_1.19.0.1       
##  [7] digest_0.6.39        rpart_4.1.24         timechange_0.4.0    
## [10] lifecycle_1.0.5      cluster_2.1.8.2      survival_3.8-6      
## [13] magrittr_2.0.4       compiler_4.5.2       rlang_1.1.7         
## [16] sass_0.4.10          tools_4.5.2          yaml_2.3.12         
## [19] data.table_1.18.2.1  knitr_1.51           labeling_0.4.3      
## [22] plyr_1.8.9           RColorBrewer_1.1-3   withr_3.0.2         
## [25] purrr_1.2.1          grid_4.5.2           stats4_4.5.2        
## [28] future_1.69.0        globals_0.19.0       scales_1.4.0        
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [34] ellipse_0.5.0        rmarkdown_2.30       generics_0.1.4      
## [37] otel_0.2.0           future.apply_1.20.1  reshape2_1.4.5      
## [40] cachem_1.1.0         stringr_1.6.0        splines_4.5.2       
## [43] parallel_4.5.2       vctrs_0.7.1          hardhat_1.4.2       
## [46] Matrix_1.7-4         jsonlite_2.0.0       listenv_0.10.0      
## [49] foreach_1.5.2        gower_1.0.2          jquerylib_0.1.4     
## [52] recipes_1.3.1        glue_1.8.0           parallelly_1.46.1   
## [55] codetools_0.2-20     lubridate_1.9.5      stringi_1.8.7       
## [58] gtable_0.3.6         rpart.plot_3.1.4     tibble_3.3.1        
## [61] CORElearn_1.57.3.1   pillar_1.11.1        htmltools_0.5.9     
## [64] ipred_0.9-15         lava_1.8.2           R6_2.6.1            
## [67] evaluate_1.0.5       bslib_0.10.0         class_7.3-23        
## [70] Rcpp_1.1.1           nlme_3.1-168         prodlim_2025.04.28  
## [73] mgcv_1.9-4           xfun_0.56            ModelMetrics_1.2.2.2
## [76] pkgconfig_2.0.3