The feature plot shows that X1 through X5
appear to have structured relationships with y, while
X6–X10 look flat — consistent with them being
noise predictors.
set.seed(200)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.654912 0.4779838 2.958475
## 7 3.529432 0.5118581 2.861742
## 9 3.446330 0.5425096 2.780756
## 11 3.378049 0.5723793 2.719410
## 13 3.332339 0.5953773 2.692863
## 15 3.309235 0.6111389 2.663046
## 17 3.317408 0.6201421 2.678898
## 19 3.311667 0.6333800 2.682098
## 21 3.316340 0.6407537 2.688887
## 23 3.326040 0.6491480 2.705915
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
knn_perf <- postResample(pred = knnPred, obs = testData$y)
knn_perf## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
set.seed(200)
marsGrid <- expand.grid(degree = 1:2, nprune = seq(2, 20, by = 2))
marsModel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv", number = 10))
marsModel## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.188280 0.3042527 3.460689
## 1 4 2.653143 0.7167280 2.128222
## 1 6 2.295006 0.7754603 1.853199
## 1 8 1.647182 0.8774867 1.299564
## 1 10 1.635035 0.8798236 1.309436
## 1 12 1.571561 0.8898750 1.253077
## 1 14 1.571673 0.8909652 1.245508
## 1 16 1.571673 0.8909652 1.245508
## 1 18 1.571673 0.8909652 1.245508
## 1 20 1.571673 0.8909652 1.245508
## 2 2 4.188280 0.3042527 3.460689
## 2 4 2.615256 0.7216809 2.128763
## 2 6 2.275048 0.7762472 1.807779
## 2 8 1.641647 0.8839822 1.288520
## 2 10 1.473254 0.9101555 1.158761
## 2 12 1.285380 0.9283193 1.033426
## 2 14 1.261797 0.9327541 1.009821
## 2 16 1.270858 0.9322465 1.009757
## 2 18 1.263778 0.9327687 1.007653
## 2 20 1.263778 0.9327687 1.007653
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(marsModel, newdata = testData$x)
mars_perf <- postResample(pred = marsPred, obs = testData$y)
mars_perf## RMSE Rsquared MAE
## 1.1722635 0.9448890 0.9324923
set.seed(200)
svmModel <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv", number = 10))
svmModel## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525164 0.7810576 2.010680
## 0.50 2.270567 0.7944850 1.794902
## 1.00 2.099319 0.8155594 1.659342
## 2.00 2.005858 0.8302852 1.578799
## 4.00 1.934650 0.8435677 1.528373
## 8.00 1.915653 0.8475592 1.528614
## 16.00 1.923884 0.8463090 1.535976
## 32.00 1.923884 0.8463090 1.535976
## 64.00 1.923884 0.8463090 1.535976
## 128.00 1.923884 0.8463090 1.535976
##
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
svmPred <- predict(svmModel, newdata = testData$x)
svm_perf <- postResample(pred = svmPred, obs = testData$y)
svm_perf## RMSE Rsquared MAE
## 2.0541197 0.8290353 1.5586411
set.seed(200)
nnetGrid <- expand.grid(size = c(3, 5, 7), decay = c(0, 0.01, 0.1), bag = FALSE)
nnetModel <- train(x = trainingData$x,
y = trainingData$y,
method = "avNNet",
preProc = c("center", "scale"),
tuneGrid = nnetGrid,
trControl = trainControl(method = "cv", number = 10),
linout = TRUE,
trace = FALSE,
maxit = 500)
nnetModel## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 3 0.00 2.085707 0.8073166 1.680859
## 3 0.01 2.078996 0.8214700 1.674115
## 3 0.10 2.125797 0.8077971 1.673049
## 5 0.00 2.006550 0.8292321 1.562333
## 5 0.01 1.961361 0.8340717 1.546856
## 5 0.10 2.130631 0.8160619 1.690481
## 7 0.00 5.236178 0.5406043 3.289442
## 7 0.01 2.340310 0.7874806 1.926235
## 7 0.10 2.110541 0.8091507 1.681723
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.01 and bag = FALSE.
nnetPred <- predict(nnetModel, newdata = testData$x)
nnet_perf <- postResample(pred = nnetPred, obs = testData$y)
nnet_perf## RMSE Rsquared MAE
## 2.0571872 0.8334507 1.5599448
results_72 <- data.frame(
Model = c("KNN", "MARS", "SVM (Radial)", "Neural Network"),
RMSE = c(knn_perf["RMSE"], mars_perf["RMSE"],
svm_perf["RMSE"], nnet_perf["RMSE"]),
Rsquared = c(knn_perf["Rsquared"], mars_perf["Rsquared"],
svm_perf["Rsquared"], nnet_perf["Rsquared"])
)
results_72 <- results_72 %>% arrange(RMSE)
results_72ggplot(results_72, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = round(RMSE, 3)), vjust = -0.4, size = 3.5) +
labs(title = "Test Set RMSE by Model (Problem 7.2)",
x = "Model", y = "RMSE") +
theme_minimal()Best model: MARS achieves the lowest test RMSE and highest R² among the four models, followed closely by SVM.
Does MARS select the informative predictors?
Yes. The variable importance plot shows that MARS assigns nonzero
importance exclusively (or overwhelmingly) to
X1–X5 — the five predictors that actually
appear in the data-generating equation — and assigns near-zero or zero
importance to the noise variables X6–X10. This
confirms that MARS successfully identifies the informative predictors
through its automatic feature selection mechanism (hinge function
pruning via GCV).
This problem uses the same preprocessing from Exercise 6.3: imputation, train/test split, and centering/scaling.
library(AppliedPredictiveModeling)
library(caret)
library(earth)
library(kernlab)
data(ChemicalManufacturingProcess)
# Separate predictors and response
yield <- ChemicalManufacturingProcess$Yield
predictors <- ChemicalManufacturingProcess[, -which(names(ChemicalManufacturingProcess) == "Yield")]
cat("Dimensions:", nrow(predictors), "rows,", ncol(predictors), "predictors\n")## Dimensions: 176 rows, 57 predictors
## Missing values: 106
set.seed(624)
# Train/test split (80/20)
trainIndex <- createDataPartition(yield, p = 0.8, list = FALSE)
trainX <- predictors[trainIndex, ]
testX <- predictors[-trainIndex, ]
trainY <- yield[trainIndex]
testY <- yield[-trainIndex]
# Pre-processing object: knnImpute + center + scale + remove near-zero variance
preProc <- preProcess(trainX,
method = c("knnImpute", "center", "scale", "nzv"))
trainX_pp <- predict(preProc, trainX)
testX_pp <- predict(preProc, testX)
cat("Post-processing: ", ncol(trainX_pp), "predictors retained\n")## Post-processing: 56 predictors retained
We train four nonlinear models using 10-fold cross-validation.
set.seed(624)
marsGrid75 <- expand.grid(degree = 1:2, nprune = seq(2, 20, by = 2))
marsModel75 <- train(x = trainX_pp, y = trainY,
method = "earth",
tuneGrid = marsGrid75,
trControl = ctrl)
marsModel75## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.390625 0.4135889 1.0998502
## 1 4 1.238005 0.5465843 1.0186226
## 1 6 1.313102 0.5215943 1.0773153
## 1 8 1.351566 0.5000706 1.1174614
## 1 10 1.331360 0.5229992 1.0827653
## 1 12 1.351705 0.5194064 1.0811689
## 1 14 1.375461 0.5327304 1.1030033
## 1 16 1.373381 0.5352340 1.1068554
## 1 18 1.373381 0.5352340 1.1068554
## 1 20 1.373381 0.5352340 1.1068554
## 2 2 1.365801 0.4302215 1.0885493
## 2 4 1.238130 0.5438861 0.9995495
## 2 6 1.214023 0.5608052 0.9808794
## 2 8 1.118244 0.6192953 0.9267876
## 2 10 1.232499 0.5667857 0.9764749
## 2 12 1.260011 0.5524708 0.9997977
## 2 14 1.320436 0.5373915 1.0438398
## 2 16 1.304633 0.5397978 1.0044578
## 2 18 1.361062 0.4989883 1.0532158
## 2 20 1.399277 0.4848048 1.0636171
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 2.
marsPred75 <- predict(marsModel75, newdata = testX_pp)
mars_perf75 <- postResample(pred = marsPred75, obs = testY)
mars_perf75## RMSE Rsquared MAE
## 1.1715413 0.7042263 0.8826463
set.seed(624)
knnModel75 <- train(x = trainX_pp, y = trainY,
method = "knn",
tuneLength = 10,
trControl = ctrl)
knnModel75## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.320716 0.4806512 1.086635
## 7 1.328711 0.4878708 1.101248
## 9 1.343512 0.4673563 1.126381
## 11 1.350832 0.4594141 1.131078
## 13 1.377998 0.4440056 1.151393
## 15 1.376341 0.4518094 1.139419
## 17 1.390919 0.4459126 1.152239
## 19 1.378572 0.4739808 1.131529
## 21 1.385286 0.4762743 1.144731
## 23 1.411143 0.4490627 1.160653
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knnPred75 <- predict(knnModel75, newdata = testX_pp)
knn_perf75 <- postResample(pred = knnPred75, obs = testY)
knn_perf75## RMSE Rsquared MAE
## 1.402164 0.612998 1.081500
set.seed(624)
svmModel75 <- train(x = trainX_pp, y = trainY,
method = "svmRadial",
tuneLength = 10,
trControl = ctrl)
svmModel75## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.418641 0.4928106 1.1602366
## 0.50 1.315950 0.5449369 1.0811411
## 1.00 1.227208 0.5943924 1.0006777
## 2.00 1.187567 0.6046434 0.9738190
## 4.00 1.172256 0.5992910 0.9708168
## 8.00 1.167551 0.5916096 0.9647169
## 16.00 1.167582 0.5913448 0.9665086
## 32.00 1.167582 0.5913448 0.9665086
## 64.00 1.167582 0.5913448 0.9665086
## 128.00 1.167582 0.5913448 0.9665086
##
## Tuning parameter 'sigma' was held constant at a value of 0.01401989
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01401989 and C = 8.
svmPred75 <- predict(svmModel75, newdata = testX_pp)
svm_perf75 <- postResample(pred = svmPred75, obs = testY)
svm_perf75## RMSE Rsquared MAE
## 1.1468696 0.7474025 0.7968618
set.seed(624)
nnetGrid75 <- expand.grid(size = c(3, 5, 7), decay = c(0, 0.01, 0.1), bag = FALSE)
nnetModel75 <- train(x = trainX_pp, y = trainY,
method = "avNNet",
tuneGrid = nnetGrid75,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
maxit = 500)
nnetModel75## Model Averaged Neural Network
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 130, 130, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 3 0.00 1.750410 0.4008507 1.380214
## 3 0.01 1.893041 0.3335000 1.344573
## 3 0.10 2.254623 0.3085421 1.461160
## 5 0.00 2.026213 0.2910224 1.685804
## 5 0.01 2.126827 0.2691764 1.508699
## 5 0.10 2.533999 0.3650738 1.464038
## 7 0.00 2.962152 0.2097523 2.177166
## 7 0.01 1.442462 0.4748245 1.178171
## 7 0.10 2.433232 0.3839267 1.532387
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 7, decay = 0.01 and bag = FALSE.
nnetPred75 <- predict(nnetModel75, newdata = testX_pp)
nnet_perf75 <- postResample(pred = nnetPred75, obs = testY)
nnet_perf75## RMSE Rsquared MAE
## 1.6901751 0.4887002 1.2419971
results_75 <- data.frame(
Model = c("MARS", "KNN", "SVM (Radial)", "Neural Network"),
CV_RMSE = c(min(marsModel75$results$RMSE),
min(knnModel75$results$RMSE),
min(svmModel75$results$RMSE),
min(nnetModel75$results$RMSE)),
Test_RMSE = c(mars_perf75["RMSE"], knn_perf75["RMSE"],
svm_perf75["RMSE"], nnet_perf75["RMSE"]),
Test_R2 = c(mars_perf75["Rsquared"], knn_perf75["Rsquared"],
svm_perf75["Rsquared"], nnet_perf75["Rsquared"])
)
results_75 <- results_75 %>% arrange(Test_RMSE)
results_75results_long <- tidyr::pivot_longer(results_75,
cols = c(CV_RMSE, Test_RMSE),
names_to = "Set", values_to = "RMSE")
ggplot(results_long, aes(x = reorder(Model, RMSE), y = RMSE, fill = Set)) +
geom_col(position = "dodge") +
geom_text(aes(label = round(RMSE, 3)), position = position_dodge(0.9),
vjust = -0.4, size = 3) +
labs(title = "CV vs Test RMSE by Model (Problem 7.5)",
x = "Model", y = "RMSE") +
theme_minimal()Part (a) Answer: The SVM with a radial kernel generally achieves the best combination of resampling and test set performance on this dataset, followed closely by MARS. Both models significantly outperform KNN. The resampling and test RMSE values are reasonably consistent, indicating the models generalize well.
# Use the best-performing nonlinear model (SVM)
svmImp75 <- varImp(svmModel75, scale = TRUE)
plot(svmImp75, top = 20, main = "SVM – Top 20 Variable Importances")svmImpDf <- as.data.frame(svmImp75$importance)
svmImpDf$Predictor <- rownames(svmImpDf)
svmImpDf <- svmImpDf %>% arrange(desc(Overall)) %>% head(10)
svmImpDfPart (b) Answer: The top predictors are dominated by
manufacturing process variables (those prefixed
ManufacturingProcess), though a few biological material
predictors (BiologicalMaterial) also appear in the top ten.
This mirrors the findings from the linear model in 6.3 — process
variables tend to rank highest because they are more directly
controllable and exhibit stronger linear or nonlinear relationships with
yield. The top ten list from the nonlinear model is broadly similar to
that from the optimal linear model, though the nonlinear model may rank
a few additional process variables more highly due to its ability to
capture interaction and curvature effects.
# Extract names of top predictors unique or notable in the nonlinear model
top_preds <- svmImpDf$Predictor[1:6]
# Combine with yield for plotting
plot_data <- as.data.frame(trainX_pp[, top_preds])
plot_data$Yield <- trainYlibrary(tidyr)
library(ggplot2)
plot_long <- pivot_longer(plot_data,
cols = all_of(top_preds),
names_to = "Predictor",
values_to = "Value")
ggplot(plot_long, aes(x = Value, y = Yield)) +
geom_point(alpha = 0.4, color = "steelblue") +
geom_smooth(method = "loess", se = TRUE, color = "firebrick", linewidth = 0.8) +
facet_wrap(~ Predictor, scales = "free_x", ncol = 2) +
labs(title = "Top Predictors vs. Yield (Training Set, Scaled)",
x = "Predictor Value (scaled)", y = "Yield") +
theme_minimal()Part (c) Answer: Several patterns emerge from the scatter plots:
This information is practically valuable: process engineers could use these plots to identify target ranges for key manufacturing parameters that maximize yield, while also using biological material scores to flag batches likely to underperform before committing to a full manufacturing run. ```