Problem 7.2 – Friedman Benchmark Data

Setup & Data Generation

library(mlbench)
library(caret)
library(earth)       # MARS
library(kernlab)     # SVM
library(ggplot2)
library(dplyr)

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Exploratory Feature Plot

featurePlot(trainingData$x, trainingData$y, between = list(x = 1, y = 1))

The feature plot shows that X1 through X5 appear to have structured relationships with y, while X6X10 look flat — consistent with them being noise predictors.

Model 1: KNN (given baseline)

set.seed(200)
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.654912  0.4779838  2.958475
##    7  3.529432  0.5118581  2.861742
##    9  3.446330  0.5425096  2.780756
##   11  3.378049  0.5723793  2.719410
##   13  3.332339  0.5953773  2.692863
##   15  3.309235  0.6111389  2.663046
##   17  3.317408  0.6201421  2.678898
##   19  3.311667  0.6333800  2.682098
##   21  3.316340  0.6407537  2.688887
##   23  3.326040  0.6491480  2.705915
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
knn_perf <- postResample(pred = knnPred, obs = testData$y)
knn_perf
##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169

Model 2: MARS

set.seed(200)
marsGrid <- expand.grid(degree = 1:2, nprune = seq(2, 20, by = 2))
marsModel <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv", number = 10))
marsModel
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.188280  0.3042527  3.460689
##   1        4      2.653143  0.7167280  2.128222
##   1        6      2.295006  0.7754603  1.853199
##   1        8      1.647182  0.8774867  1.299564
##   1       10      1.635035  0.8798236  1.309436
##   1       12      1.571561  0.8898750  1.253077
##   1       14      1.571673  0.8909652  1.245508
##   1       16      1.571673  0.8909652  1.245508
##   1       18      1.571673  0.8909652  1.245508
##   1       20      1.571673  0.8909652  1.245508
##   2        2      4.188280  0.3042527  3.460689
##   2        4      2.615256  0.7216809  2.128763
##   2        6      2.275048  0.7762472  1.807779
##   2        8      1.641647  0.8839822  1.288520
##   2       10      1.473254  0.9101555  1.158761
##   2       12      1.285380  0.9283193  1.033426
##   2       14      1.261797  0.9327541  1.009821
##   2       16      1.270858  0.9322465  1.009757
##   2       18      1.263778  0.9327687  1.007653
##   2       20      1.263778  0.9327687  1.007653
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(marsModel, newdata = testData$x)
mars_perf <- postResample(pred = marsPred, obs = testData$y)
mars_perf
##      RMSE  Rsquared       MAE 
## 1.1722635 0.9448890 0.9324923

MARS Variable Importance

marsImp <- varImp(marsModel, scale = FALSE)
plot(marsImp, main = "MARS – Variable Importance")

Model 3: Support Vector Machine (Radial)

set.seed(200)
svmModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 10,
                  trControl = trainControl(method = "cv", number = 10))
svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.525164  0.7810576  2.010680
##     0.50  2.270567  0.7944850  1.794902
##     1.00  2.099319  0.8155594  1.659342
##     2.00  2.005858  0.8302852  1.578799
##     4.00  1.934650  0.8435677  1.528373
##     8.00  1.915653  0.8475592  1.528614
##    16.00  1.923884  0.8463090  1.535976
##    32.00  1.923884  0.8463090  1.535976
##    64.00  1.923884  0.8463090  1.535976
##   128.00  1.923884  0.8463090  1.535976
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
svmPred <- predict(svmModel, newdata = testData$x)
svm_perf <- postResample(pred = svmPred, obs = testData$y)
svm_perf
##      RMSE  Rsquared       MAE 
## 2.0541197 0.8290353 1.5586411

Model 4: Neural Network (averaged)

set.seed(200)
nnetGrid <- expand.grid(size = c(3, 5, 7), decay = c(0, 0.01, 0.1), bag = FALSE)
nnetModel <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "avNNet",
                   preProc = c("center", "scale"),
                   tuneGrid = nnetGrid,
                   trControl = trainControl(method = "cv", number = 10),
                   linout = TRUE,
                   trace = FALSE,
                   maxit = 500)
nnetModel
## Model Averaged Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared   MAE     
##   3     0.00   2.085707  0.8073166  1.680859
##   3     0.01   2.078996  0.8214700  1.674115
##   3     0.10   2.125797  0.8077971  1.673049
##   5     0.00   2.006550  0.8292321  1.562333
##   5     0.01   1.961361  0.8340717  1.546856
##   5     0.10   2.130631  0.8160619  1.690481
##   7     0.00   5.236178  0.5406043  3.289442
##   7     0.01   2.340310  0.7874806  1.926235
##   7     0.10   2.110541  0.8091507  1.681723
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.01 and bag = FALSE.
nnetPred <- predict(nnetModel, newdata = testData$x)
nnet_perf <- postResample(pred = nnetPred, obs = testData$y)
nnet_perf
##      RMSE  Rsquared       MAE 
## 2.0571872 0.8334507 1.5599448

Performance Comparison

results_72 <- data.frame(
  Model    = c("KNN", "MARS", "SVM (Radial)", "Neural Network"),
  RMSE     = c(knn_perf["RMSE"], mars_perf["RMSE"],
               svm_perf["RMSE"], nnet_perf["RMSE"]),
  Rsquared = c(knn_perf["Rsquared"], mars_perf["Rsquared"],
               svm_perf["Rsquared"], nnet_perf["Rsquared"])
)
results_72 <- results_72 %>% arrange(RMSE)
results_72
ggplot(results_72, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(RMSE, 3)), vjust = -0.4, size = 3.5) +
  labs(title = "Test Set RMSE by Model (Problem 7.2)",
       x = "Model", y = "RMSE") +
  theme_minimal()

Conclusions for 7.2

Best model: MARS achieves the lowest test RMSE and highest R² among the four models, followed closely by SVM.

Does MARS select the informative predictors?
Yes. The variable importance plot shows that MARS assigns nonzero importance exclusively (or overwhelmingly) to X1X5 — the five predictors that actually appear in the data-generating equation — and assigns near-zero or zero importance to the noise variables X6X10. This confirms that MARS successfully identifies the informative predictors through its automatic feature selection mechanism (hinge function pruning via GCV).


Problem 7.5 – Chemical Manufacturing Process

This problem uses the same preprocessing from Exercise 6.3: imputation, train/test split, and centering/scaling.

Setup & Data Loading

library(AppliedPredictiveModeling)
library(caret)
library(earth)
library(kernlab)

data(ChemicalManufacturingProcess)

# Separate predictors and response
yield <- ChemicalManufacturingProcess$Yield
predictors <- ChemicalManufacturingProcess[, -which(names(ChemicalManufacturingProcess) == "Yield")]

cat("Dimensions:", nrow(predictors), "rows,", ncol(predictors), "predictors\n")
## Dimensions: 176 rows, 57 predictors
cat("Missing values:", sum(is.na(predictors)), "\n")
## Missing values: 106

Preprocessing: Imputation, Split, Centering/Scaling

set.seed(624)

# Train/test split (80/20)
trainIndex <- createDataPartition(yield, p = 0.8, list = FALSE)
trainX <- predictors[trainIndex, ]
testX  <- predictors[-trainIndex, ]
trainY <- yield[trainIndex]
testY  <- yield[-trainIndex]

# Pre-processing object: knnImpute + center + scale + remove near-zero variance
preProc <- preProcess(trainX,
                      method = c("knnImpute", "center", "scale", "nzv"))
trainX_pp <- predict(preProc, trainX)
testX_pp  <- predict(preProc, testX)

cat("Post-processing: ", ncol(trainX_pp), "predictors retained\n")
## Post-processing:  56 predictors retained

Model Training

We train four nonlinear models using 10-fold cross-validation.

ctrl <- trainControl(method = "cv", number = 10)

MARS

set.seed(624)
marsGrid75 <- expand.grid(degree = 1:2, nprune = seq(2, 20, by = 2))
marsModel75 <- train(x = trainX_pp, y = trainY,
                     method = "earth",
                     tuneGrid = marsGrid75,
                     trControl = ctrl)
marsModel75
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      1.390625  0.4135889  1.0998502
##   1        4      1.238005  0.5465843  1.0186226
##   1        6      1.313102  0.5215943  1.0773153
##   1        8      1.351566  0.5000706  1.1174614
##   1       10      1.331360  0.5229992  1.0827653
##   1       12      1.351705  0.5194064  1.0811689
##   1       14      1.375461  0.5327304  1.1030033
##   1       16      1.373381  0.5352340  1.1068554
##   1       18      1.373381  0.5352340  1.1068554
##   1       20      1.373381  0.5352340  1.1068554
##   2        2      1.365801  0.4302215  1.0885493
##   2        4      1.238130  0.5438861  0.9995495
##   2        6      1.214023  0.5608052  0.9808794
##   2        8      1.118244  0.6192953  0.9267876
##   2       10      1.232499  0.5667857  0.9764749
##   2       12      1.260011  0.5524708  0.9997977
##   2       14      1.320436  0.5373915  1.0438398
##   2       16      1.304633  0.5397978  1.0044578
##   2       18      1.361062  0.4989883  1.0532158
##   2       20      1.399277  0.4848048  1.0636171
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 2.
marsPred75 <- predict(marsModel75, newdata = testX_pp)
mars_perf75 <- postResample(pred = marsPred75, obs = testY)
mars_perf75
##      RMSE  Rsquared       MAE 
## 1.1715413 0.7042263 0.8826463

KNN

set.seed(624)
knnModel75 <- train(x = trainX_pp, y = trainY,
                    method = "knn",
                    tuneLength = 10,
                    trControl = ctrl)
knnModel75
## k-Nearest Neighbors 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.320716  0.4806512  1.086635
##    7  1.328711  0.4878708  1.101248
##    9  1.343512  0.4673563  1.126381
##   11  1.350832  0.4594141  1.131078
##   13  1.377998  0.4440056  1.151393
##   15  1.376341  0.4518094  1.139419
##   17  1.390919  0.4459126  1.152239
##   19  1.378572  0.4739808  1.131529
##   21  1.385286  0.4762743  1.144731
##   23  1.411143  0.4490627  1.160653
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knnPred75 <- predict(knnModel75, newdata = testX_pp)
knn_perf75 <- postResample(pred = knnPred75, obs = testY)
knn_perf75
##     RMSE Rsquared      MAE 
## 1.402164 0.612998 1.081500

SVM (Radial)

set.seed(624)
svmModel75 <- train(x = trainX_pp, y = trainY,
                    method = "svmRadial",
                    tuneLength = 10,
                    trControl = ctrl)
svmModel75
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE      
##     0.25  1.418641  0.4928106  1.1602366
##     0.50  1.315950  0.5449369  1.0811411
##     1.00  1.227208  0.5943924  1.0006777
##     2.00  1.187567  0.6046434  0.9738190
##     4.00  1.172256  0.5992910  0.9708168
##     8.00  1.167551  0.5916096  0.9647169
##    16.00  1.167582  0.5913448  0.9665086
##    32.00  1.167582  0.5913448  0.9665086
##    64.00  1.167582  0.5913448  0.9665086
##   128.00  1.167582  0.5913448  0.9665086
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01401989
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01401989 and C = 8.
svmPred75 <- predict(svmModel75, newdata = testX_pp)
svm_perf75 <- postResample(pred = svmPred75, obs = testY)
svm_perf75
##      RMSE  Rsquared       MAE 
## 1.1468696 0.7474025 0.7968618

Neural Network (averaged)

set.seed(624)
nnetGrid75 <- expand.grid(size = c(3, 5, 7), decay = c(0, 0.01, 0.1), bag = FALSE)
nnetModel75 <- train(x = trainX_pp, y = trainY,
                     method = "avNNet",
                     tuneGrid = nnetGrid75,
                     trControl = ctrl,
                     linout = TRUE,
                     trace = FALSE,
                     maxit = 500)
nnetModel75
## Model Averaged Neural Network 
## 
## 144 samples
##  56 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared   MAE     
##   3     0.00   1.750410  0.4008507  1.380214
##   3     0.01   1.893041  0.3335000  1.344573
##   3     0.10   2.254623  0.3085421  1.461160
##   5     0.00   2.026213  0.2910224  1.685804
##   5     0.01   2.126827  0.2691764  1.508699
##   5     0.10   2.533999  0.3650738  1.464038
##   7     0.00   2.962152  0.2097523  2.177166
##   7     0.01   1.442462  0.4748245  1.178171
##   7     0.10   2.433232  0.3839267  1.532387
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 7, decay = 0.01 and bag = FALSE.
nnetPred75 <- predict(nnetModel75, newdata = testX_pp)
nnet_perf75 <- postResample(pred = nnetPred75, obs = testY)
nnet_perf75
##      RMSE  Rsquared       MAE 
## 1.6901751 0.4887002 1.2419971

Part (a): Performance Comparison

results_75 <- data.frame(
  Model     = c("MARS", "KNN", "SVM (Radial)", "Neural Network"),
  CV_RMSE   = c(min(marsModel75$results$RMSE),
                min(knnModel75$results$RMSE),
                min(svmModel75$results$RMSE),
                min(nnetModel75$results$RMSE)),
  Test_RMSE = c(mars_perf75["RMSE"], knn_perf75["RMSE"],
                svm_perf75["RMSE"], nnet_perf75["RMSE"]),
  Test_R2   = c(mars_perf75["Rsquared"], knn_perf75["Rsquared"],
                svm_perf75["Rsquared"], nnet_perf75["Rsquared"])
)
results_75 <- results_75 %>% arrange(Test_RMSE)
results_75
results_long <- tidyr::pivot_longer(results_75,
                                    cols = c(CV_RMSE, Test_RMSE),
                                    names_to = "Set", values_to = "RMSE")
ggplot(results_long, aes(x = reorder(Model, RMSE), y = RMSE, fill = Set)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = round(RMSE, 3)), position = position_dodge(0.9),
            vjust = -0.4, size = 3) +
  labs(title = "CV vs Test RMSE by Model (Problem 7.5)",
       x = "Model", y = "RMSE") +
  theme_minimal()

Part (a) Answer: The SVM with a radial kernel generally achieves the best combination of resampling and test set performance on this dataset, followed closely by MARS. Both models significantly outperform KNN. The resampling and test RMSE values are reasonably consistent, indicating the models generalize well.

Part (b): Variable Importance

# Use the best-performing nonlinear model (SVM)
svmImp75 <- varImp(svmModel75, scale = TRUE)
plot(svmImp75, top = 20, main = "SVM – Top 20 Variable Importances")

svmImpDf <- as.data.frame(svmImp75$importance)
svmImpDf$Predictor <- rownames(svmImpDf)
svmImpDf <- svmImpDf %>% arrange(desc(Overall)) %>% head(10)
svmImpDf

Part (b) Answer: The top predictors are dominated by manufacturing process variables (those prefixed ManufacturingProcess), though a few biological material predictors (BiologicalMaterial) also appear in the top ten. This mirrors the findings from the linear model in 6.3 — process variables tend to rank highest because they are more directly controllable and exhibit stronger linear or nonlinear relationships with yield. The top ten list from the nonlinear model is broadly similar to that from the optimal linear model, though the nonlinear model may rank a few additional process variables more highly due to its ability to capture interaction and curvature effects.

Part (c): Relationships Between Top Predictors and Yield

# Extract names of top predictors unique or notable in the nonlinear model
top_preds <- svmImpDf$Predictor[1:6]

# Combine with yield for plotting
plot_data <- as.data.frame(trainX_pp[, top_preds])
plot_data$Yield <- trainY
library(tidyr)
library(ggplot2)

plot_long <- pivot_longer(plot_data,
                          cols = all_of(top_preds),
                          names_to = "Predictor",
                          values_to = "Value")

ggplot(plot_long, aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  geom_smooth(method = "loess", se = TRUE, color = "firebrick", linewidth = 0.8) +
  facet_wrap(~ Predictor, scales = "free_x", ncol = 2) +
  labs(title = "Top Predictors vs. Yield (Training Set, Scaled)",
       x = "Predictor Value (scaled)", y = "Yield") +
  theme_minimal()

Part (c) Answer: Several patterns emerge from the scatter plots:

  • Manufacturing process variables often show monotone or near-monotone relationships with yield — higher values of certain process settings are associated with higher yield, suggesting there may be an optimal operating range.
  • A few predictors display nonlinear or threshold-like relationships (e.g., yield increases steeply up to a point and then plateaus), which explains why nonlinear models outperform linear ones here.
  • Biological material predictors show more diffuse, noisy relationships with yield, consistent with the idea that biological inputs are harder to control and their effect on yield is mediated by downstream process conditions.

This information is practically valuable: process engineers could use these plots to identify target ranges for key manufacturing parameters that maximize yield, while also using biological material scores to flag batches likely to underperform before committing to a full manufacturing run. ```